269 lines
9.0 KiB
PHP
269 lines
9.0 KiB
PHP
<?php
|
|
declare(strict_types=1);

// Every response from this endpoint is JSON and must not be cached.
foreach (['Content-Type: application/json; charset=utf-8', 'Cache-Control: no-store'] as $responseHeader) {
    header($responseHeader);
}

// The single page whose anchors are scanned.
$source = 'https://spireason.neocities.org/';

// Human-curated suppressions: links intentionally excluded from LandScaper
// even if they are still present in the source page.
// List exact normalized URLs here when a source link is deliberately removed from the clone.
$deliberatelyRemoved = [
    // Example format: 'https://spireason.neocities.org/removed-example.html',
];

// HTTP options for the fetch: a 12 second timeout and an identifying User-Agent.
$httpOptions = [
    'timeout' => 12,
    'header' => "User-Agent: Laegna-LandScaper-VisibleLinkScanner/1.1\r\n",
];
$context = stream_context_create(['http' => $httpOptions]);

// Warnings are silenced on purpose; failure is detected through the return value instead.
$html = @file_get_contents($source, false, $context);

// A failed or empty download is reported as 502 with a JSON error body, then the script stops.
if (!is_string($html) || $html === '') {
    http_response_code(502);
    $failure = ['success' => false, 'error' => 'Could not fetch source page.'];
    echo json_encode($failure);
    exit;
}
|
|
|
|
// Accumulators for the scan:
//   $links           - accepted link records, in document order
//   $seen            - set of normalized URLs already accepted (dedup)
//   $ignoredCount    - anchors dropped as hidden, unlabeled, href-less or filtered
//   $suppressedCount - anchors dropped because they are listed in $deliberatelyRemoved
$links = [];
$seen = [];
$ignoredCount = 0;
$suppressedCount = 0;

// Walk every <a ...>...</a> pair in the fetched markup.
if (preg_match_all('~<a\b([^>]*)>(.*?)</a>~is', $html, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
        $attrs = $match[1] ?? '';
        $inner = $match[2] ?? '';

        if (is_hidden_anchor($attrs)) {
            $ignoredCount++;
            continue;
        }

        // The lookbehind stops "data-href" / "xlink-href" style attributes from being
        // mistaken for the real href ("-" counts as a word boundary for \b).
        // Both quoted and unquoted attribute values are accepted.
        $hrefPattern = '~(?<![\w\-])href\s*=\s*(?:(["\'])(.*?)\1|([^\s"\'>]+))~is';
        if (!preg_match($hrefPattern, $attrs, $hrefMatch)) {
            $ignoredCount++;
            continue;
        }
        // Group 3 is only populated for the unquoted form; otherwise group 2 holds the value.
        $rawHref = ($hrefMatch[3] ?? '') !== '' ? $hrefMatch[3] : $hrefMatch[2];

        $href = html_entity_decode(trim($rawHref), ENT_QUOTES | ENT_HTML5, 'UTF-8');
        $label = visible_label($attrs, $inner);
        if ($label === '') {
            $ignoredCount++;
            continue;
        }

        $absolute = absolutize_url($href, $source);
        $normalized = normalize_for_compare($absolute);
        $skipReason = skip_reason($href, $absolute, $normalized, $deliberatelyRemoved);
        if ($skipReason === 'suppressed') {
            $suppressedCount++;
            continue;
        }
        if ($skipReason !== '') {
            $ignoredCount++;
            continue;
        }

        // Duplicates are dropped silently: they count as neither ignored nor suppressed.
        if (isset($seen[$normalized])) {
            continue;
        }
        $seen[$normalized] = true;

        $category = categorize_link($absolute, $label);
        $host = strtolower((string)(parse_url($absolute, PHP_URL_HOST) ?: ''));
        $links[] = [
            'text' => unicode_limit($label, 160),
            'href' => $absolute,
            'host' => $host,
            'category' => $category,
            'category_label' => category_label($category),
            'safety' => safety_class($host),
        ];
    }
}
|
|
|
|
// Emit the scan result. Link hrefs come straight from remote markup and are never
// UTF-8 validated, so malformed bytes are substituted rather than letting
// json_encode() fail and the script answer with an empty 200 body.
$payload = json_encode([
    'success' => true,
    'source' => $source,
    'count' => count($links),
    'ignored_count' => $ignoredCount,
    'suppressed_count' => $suppressedCount,
    'scanned_at' => gmdate('Y-m-d H:i:s') . ' UTC',
    // Human-readable summary of the filtering rules applied during the scan.
    'filters' => [
        'visible_anchor_text_required' => true,
        'skips_schemes' => ['javascript', 'mailto', 'tel', 'data', 'blob'],
        'skips_asset_extensions' => ['js', 'mjs', 'css', 'map', 'png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'ico', 'woff', 'woff2', 'ttf', 'eot'],
        'unknown_external_hosts' => 'included but marked review',
    ],
    'links' => $links,
], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);

// Any remaining encoding failure is reported explicitly instead of echoing `false`.
if ($payload === false) {
    http_response_code(500);
    $payload = '{"success":false,"error":"Could not encode link list."}';
}

echo $payload;
|
|
|
|
/**
 * Decide whether an anchor is explicitly hidden by its own attributes.
 *
 * The attribute string is parsed into name/value pairs first, so words inside
 * attribute values (e.g. title="a hidden gem") and look-alike attribute names
 * (e.g. data-style) cannot trigger a false positive.
 *
 * An anchor counts as hidden when it has:
 *  - a `hidden` attribute (with or without a value),
 *  - `aria-hidden` equal to "true" (case-insensitive), or
 *  - an inline style declaring `display:none`, `visibility:hidden`, or an
 *    `opacity` of exactly zero (0, 0.0, .0, 0%). Partial opacity such as 0.5
 *    is NOT treated as hidden.
 *
 * @param string $attrs Raw attribute text taken from inside the opening <a ...> tag.
 * @return bool True when the anchor should be treated as invisible.
 */
function is_hidden_anchor(string $attrs): bool
{
    // name, optionally followed by = and a double-quoted, single-quoted or unquoted value.
    $pattern = '~([^\s"\'<>/=]+)(?:\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'=<>`]+)))?~';
    if (!preg_match_all($pattern, $attrs, $pairs, PREG_SET_ORDER)) {
        return false;
    }

    $attributes = [];
    foreach ($pairs as $pair) {
        $name = strtolower($pair[1]);
        // The first occurrence of a duplicated attribute wins.
        if (!array_key_exists($name, $attributes)) {
            // At most one of the three value groups is non-empty.
            $attributes[$name] = ($pair[2] ?? '') . ($pair[3] ?? '') . ($pair[4] ?? '');
        }
    }

    if (array_key_exists('hidden', $attributes)) {
        return true;
    }

    if (strtolower(trim($attributes['aria-hidden'] ?? '')) === 'true') {
        return true;
    }

    // Remove all whitespace so "display : none" and "display:none" compare equal.
    $style = preg_replace('/\s+/', '', strtolower($attributes['style'] ?? '')) ?? '';
    if ($style === '') {
        return false;
    }

    return str_contains($style, 'display:none')
        || str_contains($style, 'visibility:hidden')
        // Anchored to a declaration start so fill-opacity / stroke-opacity do not match,
        // and terminated so that 0.5 is not read as 0.
        || preg_match('~(?:^|;)opacity:(?:0+(?:\.0*)?|\.0+)%?(?:;|!|$)~', $style) === 1;
}
|
|
|
|
/**
 * Derive the human-visible label of an anchor.
 *
 * The text content of the anchor (with <script>/<style> bodies and all tags
 * removed, entities decoded, whitespace collapsed) is preferred. When that is
 * empty, the first non-empty `aria-label`, `title` or `alt` value is used,
 * searching the anchor's own attributes first and its inner markup second
 * (e.g. the alt text of a nested <img>).
 *
 * Attribute names are matched exactly: `data-title`, `data-alt` and similar
 * prefixed attributes are not label sources.
 *
 * @param string $attrs Raw attribute text of the opening <a ...> tag.
 * @param string $inner Raw markup between <a ...> and </a>.
 * @return string The label, or '' when the anchor has no usable visible text.
 */
function visible_label(string $attrs, string $inner): string
{
    // Decode entities, collapse whitespace runs to single spaces, trim.
    // A PCRE failure (e.g. malformed UTF-8) yields '' so the value is treated as unusable.
    $squash = static fn (string $raw): string => trim(
        preg_replace('/\s+/u', ' ', html_entity_decode($raw, ENT_QUOTES | ENT_HTML5, 'UTF-8')) ?? ''
    );

    // Script and style bodies are code, not visible text; drop them before stripping tags.
    $cleanInner = preg_replace('~<(script|style)\b[^>]*>.*?</\1>~is', ' ', $inner) ?? $inner;
    $text = $squash(strip_tags($cleanInner));
    if ($text !== '') {
        return $text;
    }

    // The lookbehind rejects names that merely end in a label attribute name
    // (data-title, data-alt, ...), which \b alone would accept.
    $labelPattern = '~(?<![\w\-])(?:aria-label|title|alt)\s*=\s*(["\'])(.*?)\1~is';
    foreach ([$attrs, $inner] as $markup) {
        if (!preg_match_all($labelPattern, $markup, $found, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($found as $hit) {
            $candidate = $squash($hit[2]);
            if ($candidate !== '') {
                return $candidate;
            }
        }
    }

    return '';
}
|
|
|
|
/**
 * Explain why a link should be left out of the result, if at all.
 *
 * Checks run in this order and the first hit wins:
 *  - 'fragment'   : empty href or a same-page "#..." reference
 *  - 'scheme'     : the href names any URI scheme other than http/https
 *                   (javascript:, mailto:, tel:, data:, blob:, ftp:, sms:, ...),
 *                   or the resolved URL is not http(s)
 *  - 'asset'      : the URL path ends in a static-asset extension
 *  - 'suppressed' : the normalized URL is listed in $deliberatelyRemoved
 *  - ''           : keep the link
 *
 * @param string $href               Entity-decoded href exactly as written in the page.
 * @param string $absolute           The href resolved against the source page.
 * @param string $normalized         $absolute after normalize_for_compare().
 * @param array  $deliberatelyRemoved URLs the curator excluded on purpose.
 * @return string One of 'fragment', 'scheme', 'asset', 'suppressed' or ''.
 */
function skip_reason(string $href, string $absolute, string $normalized, array $deliberatelyRemoved): string
{
    $hrefLower = strtolower(trim($href));
    if ($hrefLower === '' || $hrefLower[0] === '#') {
        return 'fragment';
    }

    // Reject every explicit non-web scheme, not just a fixed list: an href such as
    // "ftp://host/x" may otherwise be resolved as a relative path and slip through
    // the http(s) check on $absolute below.
    if (preg_match('~^([a-z][a-z0-9+.\-]*):~', $hrefLower, $schemeMatch)
        && !in_array($schemeMatch[1], ['http', 'https'], true)) {
        return 'scheme';
    }
    if (!preg_match('~^https?://~i', $absolute)) {
        return 'scheme';
    }

    // Only the path is inspected, so a query string cannot hide or fake an extension.
    $path = (string)(parse_url($absolute, PHP_URL_PATH) ?: '');
    if (preg_match('~\.(?:js|mjs|css|map|png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot)(?:$|[?#])~i', $path)) {
        return 'asset';
    }

    // Suppression entries are normalized the same way as the candidate before comparing.
    foreach ($deliberatelyRemoved as $removed) {
        if ($normalized === normalize_for_compare((string)$removed)) {
            return 'suppressed';
        }
    }

    return '';
}
|
|
|
|
/**
 * Assign a coarse category slug to a link.
 *
 * Rules are evaluated top to bottom and the first one that holds decides the
 * category. Keyword rules are plain substring tests against the lowercased
 * "URL + space + label" text, so a keyword also matches inside longer words
 * (for instance "script" inside "description").
 *
 * @param string $url   Absolute URL of the link.
 * @param string $label Visible label of the link.
 * @return string Category slug such as 'pdf', 'tool', 'external' or 'onsite'.
 */
function categorize_link(string $url, string $label): string
{
    $host = strtolower((string)(parse_url($url, PHP_URL_HOST) ?: ''));
    $path = strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ''));
    $haystack = strtolower($url . ' ' . $label);

    // True when the URL/label text contains at least one of the given keywords.
    $mentions = static function (array $keywords) use ($haystack): bool {
        foreach ($keywords as $keyword) {
            if (str_contains($haystack, $keyword)) {
                return true;
            }
        }

        return false;
    };

    return match (true) {
        str_ends_with($path, '.pdf') => 'pdf',
        str_ends_with($path, '.zip') => 'archive',
        str_ends_with($path, '.json'), str_contains($path, 'numberdatabase') => 'dataset',
        str_ends_with($path, '.py'), $mentions(['script']) => 'source',
        $mentions(['frequency', 'frequential', 'octave', 'calculator', 'counter']) => 'tool',
        $mentions(['bot']), str_contains($host, 'perplexity.ai') => 'bot',
        str_contains($host, 'github') => 'github',
        str_contains($path, 'apples'), $mentions(['app']) => 'applet',
        str_contains($host, 'prezi') => 'presentation',
        str_contains($host, 'youtube'), str_contains($host, 'notion') => 'media',
        // Anything left that is not on the source site is a generic external link.
        $host !== 'spireason.neocities.org' => 'external',
        default => 'onsite',
    };
}
|
|
|
|
/**
 * Translate a category slug into its display label.
 *
 * Slugs without a dedicated label fall back to the slug itself with hyphens
 * turned into spaces and the first letter capitalized.
 *
 * @param string $category Category slug as produced by categorize_link().
 * @return string Display label for the category.
 */
function category_label(string $category): string
{
    return match ($category) {
        'pdf' => 'PDF text',
        'archive' => 'Archive',
        'dataset' => 'Dataset',
        'source' => 'Source file',
        'tool' => 'Tool',
        'bot' => 'External bot/system',
        'github' => 'Repository',
        'applet' => 'Applet',
        'presentation' => 'Presentation',
        'media' => 'Media',
        'external' => 'External system',
        'onsite' => 'On-site branch',
        default => ucfirst(str_replace('-', ' ', $category)),
    };
}
|
|
|
|
/**
 * Classify a host for the safety badge.
 *
 * @param string $host Lowercased host name of the link ('' when unknown).
 * @return string 'source' for the scanned site itself, 'known-external' for an
 *                allow-listed host or any subdomain of one, otherwise 'review'.
 */
function safety_class(string $host): string
{
    if ($host === 'spireason.neocities.org') {
        return 'source';
    }

    $knownHosts = [
        'laegna.notaku.site',
        'prezi.com',
        'www.perplexity.ai',
        'huggingface.co',
        'assorted-canopy-961.notion.site',
        'www.youtube.com',
        'youtube.com',
        'github.com',
        'tambetvali.github.io',
    ];

    // Exact allow-list hit.
    if (in_array($host, $knownHosts, true)) {
        return 'known-external';
    }

    // Subdomain of an allow-listed host; the leading dot prevents "evilgithub.com" from matching.
    foreach ($knownHosts as $domain) {
        if (str_ends_with($host, '.' . $domain)) {
            return 'known-external';
        }
    }

    return 'review';
}
|
|
|
|
/**
 * Shorten text to at most $limit Unicode characters.
 *
 * Text that already fits is returned unchanged. Longer text is cut to
 * $limit - 1 characters and an ellipsis character is appended. If the text
 * cannot be split into characters (empty input or a PCRE failure), it is
 * returned as-is.
 *
 * @param string $text  Text to shorten.
 * @param int    $limit Maximum number of characters in the result.
 * @return string The original or shortened text.
 */
function unicode_limit(string $text, int $limit): string
{
    $matched = preg_match_all('/./us', $text, $chars);
    if (!$matched || count($chars[0]) <= $limit) {
        return $text;
    }

    // Leave room for the trailing ellipsis within the limit.
    $kept = array_slice($chars[0], 0, $limit - 1);

    return implode('', $kept) . '…';
}
|
|
|
|
/**
 * Produce the comparison key used for de-duplication and suppression matching.
 *
 * Surrounding whitespace is trimmed and a single trailing '#' (an empty
 * fragment) is dropped. Nothing else about the URL is altered.
 *
 * @param string $url URL to normalize.
 * @return string The comparison key.
 */
function normalize_for_compare(string $url): string
{
    $trimmed = trim($url);

    return str_ends_with($trimmed, '#') ? substr($trimmed, 0, -1) : $trimmed;
}
|
|
|
|
/**
 * Resolve an href against a base URL.
 *
 * Handles, in order:
 *  - hrefs that already carry a scheme (http:, https:, ftp:, mailto:, ...):
 *    returned as written so callers can filter on the real scheme;
 *  - protocol-relative hrefs ("//host/path"): given the base scheme;
 *  - empty and fragment-only hrefs: resolved to the base document itself;
 *  - query-only hrefs ("?a=b"): resolved against the base document path;
 *  - root-relative and document-relative paths, with "." and ".." segments
 *    removed so equivalent links produce the same URL.
 *
 * The base port, when present, is preserved. Spaces in the result are encoded
 * via encode_spaces().
 *
 * @param string $href Link target as written in the page (entity-decoded).
 * @param string $base Absolute URL of the page the href was found on.
 * @return string The resolved URL.
 */
function absolutize_url(string $href, string $base): string
{
    $href = trim($href);

    // Anything with an explicit scheme is already absolute.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return encode_spaces($href);
    }

    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = [];
    }
    $scheme = $parts['scheme'] ?? 'https';
    $host = $parts['host'] ?? 'spireason.neocities.org';
    $origin = $scheme . '://' . $host . (isset($parts['port']) ? ':' . $parts['port'] : '');

    if (str_starts_with($href, '//')) {
        return encode_spaces($scheme . ':' . $href);
    }

    $basePath = $parts['path'] ?? '/';
    if (!str_starts_with($basePath, '/')) {
        $basePath = '/' . $basePath;
    }

    // Empty or fragment-only reference: same document, keeping the base query.
    if ($href === '' || $href[0] === '#') {
        $baseQuery = isset($parts['query']) ? '?' . $parts['query'] : '';

        return encode_spaces($origin . $basePath . $baseQuery . $href);
    }

    // Query-only reference: same document path with a new query.
    if ($href[0] === '?') {
        return encode_spaces($origin . $basePath . $href);
    }

    // Separate the path from any query/fragment so dot-segment removal touches the path only.
    $cut = strcspn($href, '?#');
    $refPath = substr($href, 0, $cut);
    $suffix = substr($href, $cut);

    if (!str_starts_with($refPath, '/')) {
        // Document-relative: resolve against the directory of the base path.
        $dir = preg_replace('~/[^/]*$~', '/', $basePath) ?: '/';
        $refPath = $dir . $refPath;
    }

    // Remove "." and ".." segments; ".." never climbs above the root.
    $kept = [];
    $segments = explode('/', $refPath);
    $lastIndex = count($segments) - 1;
    foreach ($segments as $index => $segment) {
        if ($segment === '.' || $segment === '..') {
            if ($segment === '..' && count($kept) > 1) {
                array_pop($kept);
            }
            // A trailing dot segment refers to a directory, so keep the trailing slash.
            if ($index === $lastIndex) {
                $kept[] = '';
            }
            continue;
        }
        $kept[] = $segment;
    }

    return encode_spaces($origin . implode('/', $kept) . $suffix);
}
|
|
|
|
/**
 * Percent-encode literal space characters in a URL.
 *
 * Only the space character is replaced; every other character is left as-is.
 *
 * @param string $url URL that may contain raw spaces.
 * @return string The URL with each space replaced by "%20".
 */
function encode_spaces(string $url): string
{
    return implode('%20', explode(' ', $url));
}
|