40247-vm/api/scan_links.php
2026-06-12 09:15:36 +00:00

269 lines
9.0 KiB
PHP

<?php
declare(strict_types=1);
// Every response from this endpoint is JSON and must never be cached,
// because each request performs a fresh scan of the source page.
header('Content-Type: application/json; charset=utf-8');
header('Cache-Control: no-store');

// Page whose anchors are scanned; it is also passed to absolutize_url()
// as the base for resolving relative hrefs.
$source = 'https://spireason.neocities.org/';

// Human-curated suppressions: links intentionally excluded from LandScaper
// even though they are still present in the source page.
// Add the exact normalized URL of each deliberately removed link.
$deliberatelyRemoved = [
    // Example format: 'https://spireason.neocities.org/removed-example.html',
];

// HTTP options for the fetch: a 12 second timeout and an identifying User-Agent.
$httpOptions = [
    'timeout' => 12,
    'header' => "User-Agent: Laegna-LandScaper-VisibleLinkScanner/1.1\r\n",
];
$context = stream_context_create(['http' => $httpOptions]);

// '@' silences the PHP warning on failure; the failure itself is reported
// to the client as a JSON error with HTTP 502 below.
$html = @file_get_contents($source, false, $context);
$fetchFailed = in_array($html, [false, ''], true);
if ($fetchFailed) {
    http_response_code(502);
    echo json_encode(['success' => false, 'error' => 'Could not fetch source page.']);
    exit;
}
// Scan state: accepted links, a de-duplication set keyed by normalized URL,
// and the two counters reported in the response.
$links = [];
$seen = [];
$ignoredCount = 0;
$suppressedCount = 0;

// Regex-based anchor extraction: group 1 is the raw attribute string of the
// <a> tag, group 2 is everything up to the next closing </a>.
if (preg_match_all('~<a\b([^>]*)>(.*?)</a>~is', $html, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
        $attrs = $match[1] ?? '';
        $inner = $match[2] ?? '';

        if (is_hidden_anchor($attrs)) {
            $ignoredCount++;
            continue;
        }

        // Only a quoted href attribute is recognised. The lookbehind rejects
        // look-alike attributes such as data-href, which a plain \b boundary
        // would accept and which could shadow the real href that follows it.
        if (!preg_match('~(?<![\w-])href\s*=\s*(["\'])(.*?)\1~is', $attrs, $hrefMatch)) {
            $ignoredCount++;
            continue;
        }
        $href = html_entity_decode(trim($hrefMatch[2]), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Anchors without any human-readable label are not reported.
        $label = visible_label($attrs, $inner);
        if ($label === '') {
            $ignoredCount++;
            continue;
        }

        $absolute = absolutize_url($href, $source);
        $normalized = normalize_for_compare($absolute);

        // 'suppressed' is counted separately from every other skip reason.
        $skipReason = skip_reason($href, $absolute, $normalized, $deliberatelyRemoved);
        if ($skipReason === 'suppressed') {
            $suppressedCount++;
            continue;
        }
        if ($skipReason !== '') {
            $ignoredCount++;
            continue;
        }

        // Duplicates are dropped silently: they affect neither counter.
        if (isset($seen[$normalized])) {
            continue;
        }
        $seen[$normalized] = true;

        $category = categorize_link($absolute, $label);
        $host = strtolower((string)(parse_url($absolute, PHP_URL_HOST) ?: ''));
        $links[] = [
            'text' => unicode_limit($label, 160),
            'href' => $absolute,
            'host' => $host,
            'category' => $category,
            'category_label' => category_label($category),
            'safety' => safety_class($host),
        ];
    }
}
// Build the success response. The scanned page is untrusted remote content,
// so hrefs may carry bytes that are not valid UTF-8; without
// JSON_INVALID_UTF8_SUBSTITUTE json_encode() would return false and the
// client would receive an empty 200 response.
$payload = json_encode([
    'success' => true,
    'source' => $source,
    'count' => count($links),
    'ignored_count' => $ignoredCount,
    'suppressed_count' => $suppressedCount,
    'scanned_at' => gmdate('Y-m-d H:i:s') . ' UTC',
    'filters' => [
        'visible_anchor_text_required' => true,
        'skips_schemes' => ['javascript', 'mailto', 'tel', 'data', 'blob'],
        'skips_asset_extensions' => ['js', 'mjs', 'css', 'map', 'png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'ico', 'woff', 'woff2', 'ttf', 'eot'],
        'unknown_external_hosts' => 'included but marked review',
    ],
    'links' => $links,
], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);

// Any remaining encoding failure is reported explicitly instead of being
// echoed as an empty body.
if ($payload === false) {
    http_response_code(500);
    $payload = '{"success":false,"error":"Could not encode scan result."}';
}
echo $payload;
/**
 * Decide from an anchor's raw attribute string whether the link is hidden.
 *
 * An anchor counts as hidden when it carries the boolean `hidden` attribute,
 * `aria-hidden="true"`, or an inline style declaring `display: none`,
 * `visibility: hidden` or an opacity of exactly zero.
 *
 * @param string $attrs Everything between `<a` and `>` of the opening tag.
 * @return bool True when the anchor should be treated as not visible.
 */
function is_hidden_anchor(string $attrs): bool
{
    // Blank out quoted values first so that the word "hidden" inside e.g.
    // title="a hidden gem" is not mistaken for the boolean attribute.
    $withoutValues = preg_replace('~(["\']).*?\1~s', '""', $attrs) ?? $attrs;
    if (preg_match('~(?:^|\s)hidden(?:\s|=|$)~i', $withoutValues)) {
        return true;
    }

    if (preg_match('~\baria-hidden\s*=\s*(["\'])true\1~i', $attrs)) {
        return true;
    }

    if (!preg_match('~\bstyle\s*=\s*(["\'])(.*?)\1~is', $attrs, $m)) {
        return false;
    }

    // Inspect each `property: value` declaration on its own. A substring test
    // such as str_contains($style, 'opacity:0') would also hit `opacity:0.5`
    // and would miss spacing variants such as `display : none`.
    foreach (explode(';', strtolower($m[2])) as $declaration) {
        $pair = explode(':', $declaration, 2);
        if (count($pair) !== 2) {
            continue;
        }
        $property = trim($pair[0]);
        $value = trim(str_replace('!important', '', $pair[1]));

        if ($property === 'display' && $value === 'none') {
            return true;
        }
        if ($property === 'visibility' && $value === 'hidden') {
            return true;
        }
        if ($property === 'opacity') {
            // Accept both plain numbers and percentages; only zero hides.
            $number = rtrim($value, '%');
            if (is_numeric($number) && (float) $number === 0.0) {
                return true;
            }
        }
    }

    return false;
}
/**
 * Derive the human-readable label of an anchor.
 *
 * The anchor's text content (markup and embedded script/style removed,
 * entities decoded, whitespace collapsed) is preferred. When that is empty,
 * the first non-empty quoted `aria-label`, `title` or `alt` value is used,
 * looking at the anchor's own attributes first and at its inner markup second.
 *
 * @param string $attrs Raw attribute string of the opening `<a>` tag.
 * @param string $inner Raw markup between `<a …>` and `</a>`.
 * @return string The label, or '' when nothing readable was found.
 */
function visible_label(string $attrs, string $inner): string
{
    // Collapse whitespace runs to single spaces and trim; a PCRE failure
    // (for example on invalid UTF-8) yields an empty string.
    $squash = static fn (string $raw): string => trim(preg_replace('/\s+/u', ' ', $raw) ?? '');
    $decode = static fn (string $raw): string => html_entity_decode($raw, ENT_QUOTES | ENT_HTML5, 'UTF-8');

    $withoutCode = preg_replace('~<(script|style)\b[^>]*>.*?</\1>~is', ' ', $inner) ?? $inner;
    $plainText = $squash($decode(strip_tags($withoutCode)));
    if ($plainText !== '') {
        return $plainText;
    }

    // Fallback: labelling attributes, own attributes before nested markup.
    $labelAttribute = '~\b(?:aria-label|title|alt)\s*=\s*(["\'])(.*?)\1~is';
    foreach ([$attrs, $inner] as $fragment) {
        if (!preg_match_all($labelAttribute, $fragment, $hits, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($hits as $hit) {
            $candidate = $squash($decode($hit[2]));
            if ($candidate !== '') {
                return $candidate;
            }
        }
    }

    return '';
}
/**
 * Explain why a link must be left out of the result, if at all.
 *
 * Checks run in this order and the first hit wins:
 *   'fragment'   – empty href or a pure in-page `#…` reference
 *   'scheme'     – the href names a scheme other than http/https, or the
 *                  resolved URL is not http(s)
 *   'asset'      – the URL path ends in a script/style/image/font extension
 *   'suppressed' – the normalized URL is listed in $deliberatelyRemoved
 *
 * @param string $href               Href as written in the page (entity-decoded).
 * @param string $absolute           The href resolved to an absolute URL.
 * @param string $normalized         $absolute after normalize_for_compare().
 * @param array  $deliberatelyRemoved URLs that are intentionally excluded.
 * @return string One of the reasons above, or '' when the link is kept.
 */
function skip_reason(string $href, string $absolute, string $normalized, array $deliberatelyRemoved): string
{
    $hrefLower = strtolower(trim($href));
    if ($hrefLower === '' || $hrefLower[0] === '#') {
        return 'fragment';
    }

    // Schemes that are known to appear in pages and never lead to a document.
    foreach (['javascript:', 'mailto:', 'tel:', 'data:', 'blob:'] as $scheme) {
        if (str_starts_with($hrefLower, $scheme)) {
            return 'scheme';
        }
    }

    // Any other explicit scheme (ftp:, sms:, file:, …) is rejected as well.
    // Checking the original href matters: a resolver that treats such an href
    // as a relative path produces an https:// URL that would pass the test
    // on $absolute below.
    if (preg_match('~^([a-z][a-z0-9+.\-]*):~', $hrefLower, $schemeMatch)
        && !in_array($schemeMatch[1], ['http', 'https'], true)) {
        return 'scheme';
    }

    if (!preg_match('~^https?://~i', $absolute)) {
        return 'scheme';
    }

    $path = (string)(parse_url($absolute, PHP_URL_PATH) ?: '');
    if (preg_match('~\.(?:js|mjs|css|map|png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot)(?:$|[?#])~i', $path)) {
        return 'asset';
    }

    // Suppression entries are normalized the same way as the candidate URL.
    foreach ($deliberatelyRemoved as $removed) {
        if ($normalized === normalize_for_compare((string)$removed)) {
            return 'suppressed';
        }
    }

    return '';
}
/**
 * Assign a category key to a link based on its URL and label.
 *
 * Rules are evaluated top to bottom and the first match wins. All keyword
 * tests are plain case-insensitive substring checks, so e.g. "description"
 * satisfies the "script" rule and "happy" satisfies the "app" rule.
 *
 * @param string $url   Absolute URL of the link.
 * @param string $label Visible label of the link.
 * @return string Category key such as 'pdf', 'tool', 'external' or 'onsite'.
 */
function categorize_link(string $url, string $label): string
{
    $host = strtolower((string)(parse_url($url, PHP_URL_HOST) ?: ''));
    $path = strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ''));
    $haystack = strtolower($url . ' ' . $label);

    // True when $subject contains at least one of the given needles.
    $mentions = static function (string $subject, string ...$needles): bool {
        foreach ($needles as $needle) {
            if (str_contains($subject, $needle)) {
                return true;
            }
        }
        return false;
    };

    return match (true) {
        str_ends_with($path, '.pdf') => 'pdf',
        str_ends_with($path, '.zip') => 'archive',
        str_ends_with($path, '.json'), $mentions($path, 'numberdatabase') => 'dataset',
        str_ends_with($path, '.py'), $mentions($haystack, 'script') => 'source',
        $mentions($haystack, 'frequency', 'frequential', 'octave', 'calculator', 'counter') => 'tool',
        $mentions($haystack, 'bot'), $mentions($host, 'perplexity.ai') => 'bot',
        $mentions($host, 'github') => 'github',
        $mentions($path, 'apples'), $mentions($haystack, 'app') => 'applet',
        $mentions($host, 'prezi') => 'presentation',
        $mentions($host, 'youtube', 'notion') => 'media',
        $host !== 'spireason.neocities.org' => 'external',
        default => 'onsite',
    };
}
/**
 * Translate a category key into its display label.
 *
 * Unknown keys fall back to the key itself with hyphens turned into spaces
 * and the first letter upper-cased.
 *
 * @param string $category Category key as produced by categorize_link().
 * @return string Human-readable label.
 */
function category_label(string $category): string
{
    return match ($category) {
        'pdf' => 'PDF text',
        'archive' => 'Archive',
        'dataset' => 'Dataset',
        'source' => 'Source file',
        'tool' => 'Tool',
        'bot' => 'External bot/system',
        'github' => 'Repository',
        'applet' => 'Applet',
        'presentation' => 'Presentation',
        'media' => 'Media',
        'external' => 'External system',
        'onsite' => 'On-site branch',
        default => ucfirst(str_replace('-', ' ', $category)),
    };
}
/**
 * Classify a host by how much the scanner trusts it.
 *
 * @param string $host Host name to classify; it is compared as given.
 * @return string 'source' for the scanned site itself, 'known-external' for
 *                an allow-listed host or any of its subdomains, otherwise
 *                'review'.
 */
function safety_class(string $host): string
{
    if ($host === 'spireason.neocities.org') {
        return 'source';
    }

    $trustedHosts = [
        'laegna.notaku.site',
        'prezi.com',
        'www.perplexity.ai',
        'huggingface.co',
        'assorted-canopy-961.notion.site',
        'www.youtube.com',
        'youtube.com',
        'github.com',
        'tambetvali.github.io',
    ];

    // Exact hit on the allow-list.
    if (in_array($host, $trustedHosts, true)) {
        return 'known-external';
    }

    // Subdomain of an allow-listed host; the leading dot prevents
    // "evilgithub.com" from passing as "github.com".
    foreach ($trustedHosts as $parent) {
        if (str_ends_with($host, '.' . $parent)) {
            return 'known-external';
        }
    }

    return 'review';
}
/**
 * Shorten text to at most $limit Unicode code points.
 *
 * Text that already fits is returned unchanged. Longer text is cut to
 * $limit - 1 code points followed by a single ellipsis character, so the
 * result is exactly $limit code points long. Text that is not valid UTF-8
 * cannot be split into code points and is returned unchanged.
 *
 * @param string $text  UTF-8 text to shorten.
 * @param int    $limit Maximum number of code points; values below 1 yield ''.
 * @return string The possibly shortened text.
 */
function unicode_limit(string $text, int $limit): string
{
    // Without this guard a non-positive limit becomes a negative slice length
    // below, which keeps nearly the whole text instead of truncating it.
    if ($limit <= 0) {
        return '';
    }

    // PCRE with the /u flag splits on code points rather than bytes.
    if (!preg_match_all('/./us', $text, $chars) || count($chars[0]) <= $limit) {
        return $text;
    }

    return implode('', array_slice($chars[0], 0, $limit - 1)) . '…';
}
/**
 * Reduce a URL to the form used for duplicate and suppression checks.
 *
 * Surrounding whitespace is removed and a single trailing '#' (an empty
 * fragment) is dropped. Non-empty fragments are left in place.
 *
 * @param string $url URL to normalize.
 * @return string The normalized URL.
 */
function normalize_for_compare(string $url): string
{
    $trimmed = trim($url);

    return str_ends_with($trimmed, '#')
        ? substr($trimmed, 0, -1)
        : $trimmed;
}
/**
 * Resolve an href against a base URL and percent-encode literal spaces.
 *
 * Handles, in order: hrefs that carry their own scheme (returned as they
 * are), protocol-relative hrefs (`//host/…`), root-relative hrefs (`/…`) and
 * hrefs relative to the directory of the base path. Dot segments (`./`,
 * `../`) are not collapsed.
 *
 * @param string $href Href as written in the page.
 * @param string $base Absolute URL of the page the href was found on.
 * @return string The absolute URL, or the unchanged href when it already
 *                names a scheme.
 */
function absolutize_url(string $href, string $base): string
{
    $href = trim($href);

    // An href with any explicit scheme is already absolute. Gluing e.g.
    // "ftp://host/file" onto the base would fabricate an on-site URL; passing
    // it through lets callers reject non-http(s) links by inspecting it.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return encode_spaces($href);
    }

    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = [];
    }
    $scheme = $parts['scheme'] ?? 'https';
    $host = $parts['host'] ?? 'spireason.neocities.org';
    // Keep an explicit port of the base so resolved links stay on the same origin.
    $authority = isset($parts['port']) ? $host . ':' . $parts['port'] : $host;

    if (str_starts_with($href, '//')) {
        return encode_spaces($scheme . ':' . $href);
    }
    if (str_starts_with($href, '/')) {
        return encode_spaces($scheme . '://' . $authority . $href);
    }

    // Relative href: resolve against the directory part of the base path.
    $path = $parts['path'] ?? '/';
    $dir = preg_replace('~/[^/]*$~', '/', $path) ?: '/';

    return encode_spaces($scheme . '://' . $authority . $dir . $href);
}
/**
 * Percent-encode every literal space character in a URL.
 *
 * Only the space character is touched; all other characters are left as
 * they are.
 *
 * @param string $url URL that may contain raw spaces.
 * @return string The URL with each ' ' replaced by '%20'.
 */
function encode_spaces(string $url): string
{
    return implode('%20', explode(' ', $url));
}