35082-vm/api/summarize.php
Flatlogic Bot b113664b26 A-1
2025-10-21 11:59:17 +00:00

118 lines
3.9 KiB
PHP

<?php
// Bootstrap for the summarize endpoint: open the session, declare a JSON
// response, load the database helper, then enforce per-IP rate limits.
session_start();
header('Content-Type: application/json');
require_once __DIR__ . '/../db/config.php';

// --- Rate Limiting ---

// Schema of the request log; created on demand as a minimal migration.
$rate_limit_schema = "CREATE TABLE IF NOT EXISTS rate_limits (
id INT AUTO_INCREMENT PRIMARY KEY,
ip_address VARCHAR(45) NOT NULL,
request_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)";

// Limits are evaluated in order; the first one that is reached ends the
// request with HTTP 429. The second query carries no time window of its own:
// rows older than one day are purged just before the checks run, so the
// table itself only holds the last 24 hours.
$rate_limit_rules = [
    [
        'query'   => "SELECT COUNT(*) FROM rate_limits WHERE ip_address = ? AND request_timestamp > NOW() - INTERVAL 1 MINUTE",
        'max'     => 5,
        'message' => 'Too many requests. Please wait a minute before trying again.',
    ],
    [
        'query'   => "SELECT COUNT(*) FROM rate_limits WHERE ip_address = ?",
        'max'     => 100,
        'message' => 'You have reached your daily limit of 100 summaries.',
    ],
];

try {
    // $db and $ip_address are read again further down the script when the
    // request is logged, so they are deliberately left in script scope.
    $db = db();
    $ip_address = $_SERVER['REMOTE_ADDR'];

    $db->exec($rate_limit_schema);

    // Drop log entries that fell out of the one-day window.
    $purge = $db->prepare("DELETE FROM rate_limits WHERE request_timestamp < NOW() - INTERVAL 1 DAY");
    $purge->execute();

    foreach ($rate_limit_rules as $rule) {
        $counter = $db->prepare($rule['query']);
        $counter->execute([$ip_address]);

        if ($counter->fetchColumn() >= $rule['max']) {
            http_response_code(429); // Too Many Requests
            echo json_encode(['error' => $rule['message']]);
            exit;
        }
    }
} catch (PDOException $db_error) {
    // The exception details are intentionally not sent to the client.
    http_response_code(500);
    echo json_encode(['error' => 'Could not connect to the database for rate limiting.']);
    exit;
}
// --- Summarization Logic ---
// Reads the POST field `url`, fetches that page and answers with JSON:
//   {"summary": "..."} on success, {"error": "..."} with a 4xx/5xx status otherwise.
// Uses $db and $ip_address prepared by the rate-limiting section above.

// A non-string value (e.g. `url[]=x`) is treated as missing instead of being
// passed to trim(), which would raise a TypeError on PHP 8.
$url = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

// FILTER_VALIDATE_URL accepts any scheme (file://, gopher://, ...). Only web
// URLs may be fetched, otherwise the endpoint could be used to read local files.
$scheme = $url !== '' ? strtolower((string) parse_url($url, PHP_URL_SCHEME)) : '';
if ($url === '' || !filter_var($url, FILTER_VALIDATE_URL) || !in_array($scheme, ['http', 'https'], true)) {
    http_response_code(400);
    echo json_encode(['error' => 'Invalid or missing URL.']);
    exit;
}

// md5 is used purely as a compact cache key here, not for security.
$url_hash = md5($url);

// Check session for cached summary (valid for 30 minutes)
if (isset($_SESSION['summaries'][$url_hash]) && (time() - $_SESSION['summaries'][$url_hash]['timestamp'] < 1800)) {
    echo json_encode(['summary' => $_SESSION['summaries'][$url_hash]['summary']]);
    exit;
}

// If not cached, proceed to fetch and summarize.
// Log the request for rate limiting. Cached responses above are not counted.
try {
    $db->prepare("INSERT INTO rate_limits (ip_address) VALUES (?)")->execute([$ip_address]);
} catch (PDOException $e) {
    // Keep the JSON error contract instead of letting the exception escape.
    // NOTE(review): this assumes db() configures PDO to throw exceptions, as
    // the rate-limiting section's own catch block suggests; confirm in config.php.
    http_response_code(500);
    echo json_encode(['error' => 'Could not record the request for rate limiting.']);
    exit;
}

// 1. Scrape the article content using cURL
$ch = curl_init();
if ($ch === false) {
    http_response_code(500);
    echo json_encode(['error' => 'Failed to fetch article content. The site may be down or blocking requests.']);
    exit;
}
curl_setopt_array($ch, [
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    // Bound the redirect chain and keep every hop on HTTP(S), so a redirect
    // cannot switch to another protocol handler.
    CURLOPT_MAXREDIRS      => 5,
    CURLOPT_PROTOCOLS      => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
    CURLOPT_USERAGENT      => 'ArtickleSummarizer/1.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    CURLOPT_TIMEOUT        => 15,
]);
// NOTE(review): requests to internal/private hosts (e.g. 127.0.0.1, cloud
// metadata addresses) are still possible; blocking them needs a check on the
// resolved IP and is not done here.
$html = curl_exec($ch);
$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);

// curl_exec() yields false on transport errors; empty() covers that as well
// as an empty body.
if ($http_code < 200 || $http_code >= 300 || empty($html)) {
    http_response_code(500);
    echo json_encode(['error' => 'Failed to fetch article content. The site may be down or blocking requests.']);
    exit;
}

// 2. Extract main content (simple heuristic: the text of every <p> element)
libxml_use_internal_errors(true); // real-world HTML is rarely valid; silence parser warnings
$doc = new DOMDocument();
$doc->loadHTML($html);
libxml_clear_errors();

// Collect non-empty paragraphs only, so blank <p> elements neither count as
// content nor take up slots in the summary.
$paragraphs = [];
foreach ($doc->getElementsByTagName('p') as $p) {
    $paragraph_text = trim((string) $p->nodeValue);
    if ($paragraph_text !== '') {
        $paragraphs[] = $paragraph_text;
    }
}

if ($paragraphs === []) {
    http_response_code(500);
    echo json_encode(['error' => 'Could not extract readable content from the article.']);
    exit;
}

// 3. Summarize (Placeholder for in-house model)
// This is a placeholder. In a real scenario, this would call an internal summarization model.
// For now, it returns the first 5 non-empty paragraphs, separated by blank lines.
$summary = implode("\n\n", array_slice($paragraphs, 0, 5));

// 4. Cache the summary in the session
$_SESSION['summaries'][$url_hash] = [
    'summary' => $summary,
    'timestamp' => time(),
];

echo json_encode(['summary' => $summary]);