exec("CREATE TABLE IF NOT EXISTS rate_limits ( id INT AUTO_INCREMENT PRIMARY KEY, ip_address VARCHAR(45) NOT NULL, request_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP )"); // Clean up old records $db->prepare("DELETE FROM rate_limits WHERE request_timestamp < NOW() - INTERVAL 1 DAY")->execute(); // Check minute limit (5 requests) $stmt_minute = $db->prepare("SELECT COUNT(*) FROM rate_limits WHERE ip_address = ? AND request_timestamp > NOW() - INTERVAL 1 MINUTE"); $stmt_minute->execute([$ip_address]); if ($stmt_minute->fetchColumn() >= 5) { http_response_code(429); // Too Many Requests echo json_encode(['error' => 'Too many requests. Please wait a minute before trying again.']); exit; } // Check daily limit (100 requests) $stmt_day = $db->prepare("SELECT COUNT(*) FROM rate_limits WHERE ip_address = ?"); $stmt_day->execute([$ip_address]); if ($stmt_day->fetchColumn() >= 100) { http_response_code(429); echo json_encode(['error' => 'You have reached your daily limit of 100 summaries.']); exit; } } catch (PDOException $e) { http_response_code(500); // Do not expose detailed error to client echo json_encode(['error' => 'Could not connect to the database for rate limiting.']); exit; } // --- Summarization Logic --- $url = isset($_POST['url']) ? trim($_POST['url']) : ''; if (empty($url) || !filter_var($url, FILTER_VALIDATE_URL)) { http_response_code(400); echo json_encode(['error' => 'Invalid or missing URL.']); exit; } $url_hash = md5($url); // Check session for cached summary (valid for 30 minutes) if (isset($_SESSION['summaries'][$url_hash]) && (time() - $_SESSION['summaries'][$url_hash]['timestamp'] < 1800)) { echo json_encode(['summary' => $_SESSION['summaries'][$url_hash]['summary']]); exit; } // If not cached, proceed to fetch and summarize // Log the request for rate limiting $db->prepare("INSERT INTO rate_limits (ip_address) VALUES (?)")->execute([$ip_address]); // 1. Scrape the article content using cURL $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); curl_setopt($ch, CURLOPT_USERAGENT, 'ArtickleSummarizer/1.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'); curl_setopt($ch, CURLOPT_TIMEOUT, 15); $html = curl_exec($ch); $http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE); curl_close($ch); if ($http_code < 200 || $http_code >= 300 || empty($html)) { http_response_code(500); echo json_encode(['error' => 'Failed to fetch article content. The site may be down or blocking requests.']); exit; } // 2. Extract main content (simple heuristic) libxml_use_internal_errors(true); $doc = new DOMDocument(); $doc->loadHTML($html); libxml_clear_errors(); $paragraphs = $doc->getElementsByTagName('p'); $text_content = ""; foreach ($paragraphs as $p) { $text_content .= $p->nodeValue . "\n\n"; } if (trim($text_content) === "") { http_response_code(500); echo json_encode(['error' => 'Could not extract readable content from the article.']); exit; } // 3. Summarize (Placeholder for in-house model) // This is a placeholder function. In a real scenario, this would call an internal summarization model. // For now, it returns the first 5 paragraphs. $sentences = explode("\n\n", $text_content); $summary = implode("\n\n", array_slice($sentences, 0, 5)); // 4. Cache the summary in the session $_SESSION['summaries'][$url_hash] = [ 'summary' => $summary, 'timestamp' => time() ]; echo json_encode(['summary' => $summary]);