<?php
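/**
 * JSON endpoint: accepts a POST parameter `url`, fetches the article behind it,
 * and responds with either {"summary": "..."} or {"error": "..."}.
 *
 * Requests are rate limited per client IP (5 per minute, 100 per day) and
 * summaries are cached in the session for 30 minutes.
 *
 * Example request (the script path shown is illustrative, not part of this file):
 *
 *   curl -X POST -d "url=https://example.com/article" https://your-host/summarize.php
 */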

session_start();
header('Content-Type: application/json');

require_once __DIR__ . '/../db/config.php';

// --- Rate Limiting ---
try {
    $db = db();
    $ip_address = $_SERVER['REMOTE_ADDR'];
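    // Note: REMOTE_ADDR is the address of the immediate client. Behind a
    // reverse proxy or load balancer it will be the proxy's IP, so all visitors
    // would share one rate-limit bucket; in that setup the trusted
    // forwarded-for header would need to be consulted instead (deployment
    // specific, not handled here).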

    // Create table if it doesn't exist (simple migration)
    $db->exec("CREATE TABLE IF NOT EXISTS rate_limits (
        id INT AUTO_INCREMENT PRIMARY KEY,
        ip_address VARCHAR(45) NOT NULL,
        request_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )");

    // Clean up old records
    $db->prepare("DELETE FROM rate_limits WHERE request_timestamp < NOW() - INTERVAL 1 DAY")->execute();

    // Check minute limit (5 requests)
    $stmt_minute = $db->prepare("SELECT COUNT(*) FROM rate_limits WHERE ip_address = ? AND request_timestamp > NOW() - INTERVAL 1 MINUTE");
    $stmt_minute->execute([$ip_address]);
    if ($stmt_minute->fetchColumn() >= 5) {
        http_response_code(429); // Too Many Requests
        echo json_encode(['error' => 'Too many requests. Please wait a minute before trying again.']);
        exit;
    }

    // Check daily limit (100 requests); records older than a day were purged above,
    // so counting all rows for this IP covers the last 24 hours.
    $stmt_day = $db->prepare("SELECT COUNT(*) FROM rate_limits WHERE ip_address = ?");
    $stmt_day->execute([$ip_address]);
    if ($stmt_day->fetchColumn() >= 100) {
        http_response_code(429);
        echo json_encode(['error' => 'You have reached your daily limit of 100 summaries.']);
        exit;
    }

} catch (PDOException $e) {
    http_response_code(500);
    // Log the real error server-side; do not expose details to the client
    error_log('Rate limiting DB error: ' . $e->getMessage());
    echo json_encode(['error' => 'A database error occurred while checking rate limits.']);
    exit;
}

// --- Summarization Logic ---
$url = isset($_POST['url']) ? trim($_POST['url']) : '';

if (empty($url) || !filter_var($url, FILTER_VALIDATE_URL)) {
    http_response_code(400);
    echo json_encode(['error' => 'Invalid or missing URL.']);
    exit;
}
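// FILTER_VALIDATE_URL accepts non-HTTP schemes (ftp://, etc.) and the URL is
// fetched server-side below, so restricting it to http/https is a reasonable
// hardening step, e.g.:
//
//   if (!in_array(parse_url($url, PHP_URL_SCHEME), ['http', 'https'], true)) { /* reject */ }
//
// (Suggestion only; not applied above.)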

$url_hash = md5($url);

// Check session for cached summary (valid for 30 minutes)
if (isset($_SESSION['summaries'][$url_hash]) && (time() - $_SESSION['summaries'][$url_hash]['timestamp'] < 1800)) {
    echo json_encode(['summary' => $_SESSION['summaries'][$url_hash]['summary']]);
    exit;
}
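// The cache lives in $_SESSION, so it is per visitor: two users summarizing the
// same URL each trigger their own fetch. A shared cache (for example a table
// keyed by url_hash) would avoid that, at the cost of another query per request.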

// Not cached: log the request for rate limiting, then fetch and summarize
$db->prepare("INSERT INTO rate_limits (ip_address) VALUES (?)")->execute([$ip_address]);

// 1. Scrape the article content using cURL
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'ArtickleSummarizer/1.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)');
curl_setopt($ch, CURLOPT_TIMEOUT, 15);
$html = curl_exec($ch);
$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
curl_close($ch);
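// CURLOPT_TIMEOUT bounds the whole transfer at 15 seconds. If unbounded redirect
// chains are a concern with CURLOPT_FOLLOWLOCATION enabled, CURLOPT_MAXREDIRS
// can cap them (suggestion only; not set above).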

if ($http_code < 200 || $http_code >= 300 || empty($html)) {
    http_response_code(500);
    echo json_encode(['error' => 'Failed to fetch article content. The site may be down or blocking requests.']);
    exit;
}

// 2. Extract main content (simple heuristic)
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML($html);
libxml_clear_errors();

$paragraphs = $doc->getElementsByTagName('p');
$text_content = "";
foreach ($paragraphs as $p) {
    $text_content .= $p->nodeValue . "\n\n";
}
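// Concatenating every <p> node also picks up boilerplate such as cookie
// notices, bylines, and footer text, and loadHTML() assumes ISO-8859-1 when the
// page does not declare its encoding, so non-ASCII characters can be mangled.
// A readability-style extractor would be a more robust replacement for this
// heuristic if that becomes a problem.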

if (trim($text_content) === "") {
    http_response_code(500);
    echo json_encode(['error' => 'Could not extract readable content from the article.']);
    exit;
}

// 3. Summarize (placeholder for in-house model)
// This is a placeholder. In a real scenario it would call an internal
// summarization model; for now it returns the first 5 paragraphs.
$paragraph_blocks = explode("\n\n", $text_content);
$summary = implode("\n\n", array_slice($paragraph_blocks, 0, 5));
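
// A minimal sketch of how an internal model call might replace the slice above.
// The endpoint URL, payload shape, and response field are assumptions for
// illustration, not an existing API:
//
// $model = curl_init('http://summarizer.internal/v1/summarize'); // hypothetical service
// curl_setopt_array($model, [
//     CURLOPT_RETURNTRANSFER => true,
//     CURLOPT_POST           => true,
//     CURLOPT_HTTPHEADER     => ['Content-Type: application/json'],
//     CURLOPT_POSTFIELDS     => json_encode(['text' => $text_content]),
//     CURLOPT_TIMEOUT        => 30,
// ]);
// $model_response = curl_exec($model);
// curl_close($model);
// if ($model_response !== false) {
//     $decoded = json_decode($model_response, true);
//     $summary = $decoded['summary'] ?? $summary; // fall back to the paragraph slice
// }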

// 4. Cache the summary in the session
$_SESSION['summaries'][$url_hash] = [
    'summary' => $summary,
    'timestamp' => time()
];

echo json_encode(['summary' => $summary]);