[ 'timeout' => 12, 'header' => "User-Agent: Laegna-LandScaper-VisibleLinkScanner/1.1\r\n", ], ]);

// Fetch the page to scan. Warnings are suppressed on purpose: a failed or empty
// fetch is reported to the client as a JSON error with HTTP 502 instead.
$html = @file_get_contents($source, false, $context);
if ($html === false || $html === '') {
    http_response_code(502);
    echo json_encode(['success' => false, 'error' => 'Could not fetch source page.']);
    exit;
}

$links = [];            // Accepted links, in document order.
$seen = [];             // Normalized URL => true, used to drop duplicates.
$ignoredCount = 0;      // Anchors rejected as hidden, unlabeled, or skipped.
$suppressedCount = 0;   // Anchors matching the $deliberatelyRemoved list.

// Group 1 is the raw attribute string of the anchor, group 2 its inner HTML.
if (preg_match_all('~<a\b([^>]*)>(.*?)</a\s*>~is', $html, $matches, PREG_SET_ORDER)) {
    foreach ($matches as $match) {
        $attrs = $match[1] ?? '';
        $inner = $match[2] ?? '';

        if (is_hidden_anchor($attrs)) {
            $ignoredCount++;
            continue;
        }

        // Accept quoted and unquoted values; the lookbehind keeps attributes
        // such as data-href from being mistaken for href.
        if (!preg_match('~(?<![\w-])href\s*=\s*(?:(["\'])(.*?)\1|([^\s"\'>]+))~is', $attrs, $hrefMatch)) {
            $ignoredCount++;
            continue;
        }
        $rawHref = ($hrefMatch[3] ?? '') !== '' ? $hrefMatch[3] : $hrefMatch[2];
        $href = html_entity_decode(trim($rawHref), ENT_QUOTES | ENT_HTML5, 'UTF-8');

        $label = visible_label($attrs, $inner);
        if ($label === '') {
            $ignoredCount++;
            continue;
        }

        $absolute = absolutize_url($href, $source);
        $normalized = normalize_for_compare($absolute);

        $skipReason = skip_reason($href, $absolute, $normalized, $deliberatelyRemoved);
        if ($skipReason === 'suppressed') {
            $suppressedCount++;
            continue;
        }
        if ($skipReason !== '') {
            $ignoredCount++;
            continue;
        }

        // Duplicates are dropped silently; they count as neither ignored nor suppressed.
        if (isset($seen[$normalized])) {
            continue;
        }
        $seen[$normalized] = true;

        $category = categorize_link($absolute, $label);
        $host = strtolower((string)(parse_url($absolute, PHP_URL_HOST) ?: ''));

        $links[] = [
            'text' => unicode_limit($label, 160),
            'href' => $absolute,
            'host' => $host,
            'category' => $category,
            'category_label' => category_label($category),
            'safety' => safety_class($host),
        ];
    }
}

// JSON_INVALID_UTF8_SUBSTITUTE keeps the response valid JSON even when a scanned
// href carries bytes that are not valid UTF-8 (json_encode would return false otherwise).
echo json_encode([
    'success' => true,
    'source' => $source,
    'count' => count($links),
    'ignored_count' => $ignoredCount,
    'suppressed_count' => $suppressedCount,
    'scanned_at' => gmdate('Y-m-d H:i:s') . ' UTC',
    'filters' => [
        'visible_anchor_text_required' => true,
        'skips_schemes' => ['javascript', 'mailto', 'tel', 'data', 'blob'],
        'skips_asset_extensions' => ['js', 'mjs', 'css', 'map', 'png', 'jpg', 'jpeg', 'gif', 'svg', 'webp', 'ico', 'woff', 'woff2', 'ttf', 'eot'],
        'unknown_external_hosts' => 'included but marked review',
    ],
    'links' => $links,
], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);

/**
 * Tells whether an anchor's attribute string marks the anchor as hidden.
 *
 * Checks the bare `hidden` attribute, `aria-hidden="true"` (quoted or unquoted)
 * and an inline style declaring `display: none`, `visibility: hidden` or an
 * opacity of exactly zero.
 *
 * @param string $attrs Raw attribute text of the opening anchor tag.
 * @return bool True when one of the hidden markers is present.
 */
function is_hidden_anchor(string $attrs): bool
{
    if (preg_match('~(?:^|\s)hidden(?:\s|=|$)~i', $attrs)) {
        return true;
    }

    if (preg_match('~\baria-hidden\s*=\s*(["\']?)true\1(?=\s|/|$)~i', $attrs)) {
        return true;
    }

    if (!preg_match('~(?<![\w-])style\s*=\s*(["\'])(.*?)\1~is', $attrs, $m)) {
        return false;
    }
    $style = strtolower($m[2]);

    if (preg_match('~(?<![\w-])(?:display\s*:\s*none|visibility\s*:\s*hidden)\b~', $style)) {
        return true;
    }

    // Only a value of exactly zero hides the element; "opacity: 0.5" must not match.
    return preg_match('~(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)%?\s*(?:;|!|$)~', $style) === 1;
}

/**
 * Derives the human-visible label of an anchor.
 *
 * The text content of the inner HTML is preferred (with script/style elements
 * removed, entities decoded and whitespace collapsed). When that is empty, the
 * first non-empty aria-label, title or alt value found in the anchor attributes,
 * then in the inner HTML, is used.
 *
 * @param string $attrs Raw attribute text of the opening anchor tag.
 * @param string $inner Inner HTML of the anchor.
 * @return string The label, or '' when nothing visible could be found.
 */
function visible_label(string $attrs, string $inner): string
{
    // Remove whole script/style elements, including their content, so that
    // code never ends up in the label.
    $cleanInner = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $inner) ?? $inner;

    $text = html_entity_decode(strip_tags($cleanInner), ENT_QUOTES | ENT_HTML5, 'UTF-8');
    // U+00A0 is listed explicitly: \s does not cover it, and a label made only of
    // non-breaking spaces is not visible text.
    $text = trim(preg_replace('/[\s\x{00A0}]+/u', ' ', $text) ?? '');
    if ($text !== '') {
        return $text;
    }

    foreach ([$attrs, $inner] as $source) {
        if (!preg_match_all('~\b(?:aria-label|title|alt)\s*=\s*(["\'])(.*?)\1~is', $source, $matches, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($matches as $match) {
            $decoded = html_entity_decode($match[2], ENT_QUOTES | ENT_HTML5, 'UTF-8');
            $candidate = trim(preg_replace('/[\s\x{00A0}]+/u', ' ', $decoded) ?? '');
            if ($candidate !== '') {
                return $candidate;
            }
        }
    }

    return '';
}

/**
 * Decides whether a link must be left out of the result.
 *
 * @param string $href               Href as written in the page (entity-decoded).
 * @param string $absolute           Absolute form of the href.
 * @param string $normalized         Comparison form of $absolute.
 * @param array  $deliberatelyRemoved URLs that are suppressed on purpose.
 * @return string '' to keep the link, otherwise one of 'fragment', 'scheme',
 *                'asset' or 'suppressed'.
 */
function skip_reason(string $href, string $absolute, string $normalized, array $deliberatelyRemoved): string
{
    $hrefLower = strtolower(trim($href));
    if ($hrefLower === '' || $hrefLower[0] === '#') {
        return 'fragment';
    }

    foreach (['javascript:', 'mailto:', 'tel:', 'data:', 'blob:'] as $scheme) {
        if (str_starts_with($hrefLower, $scheme)) {
            return 'scheme';
        }
    }

    if (!preg_match('~^https?://~i', $absolute)) {
        return 'scheme';
    }

    $path = (string)(parse_url($absolute, PHP_URL_PATH) ?: '');
    if (preg_match('~\.(?:js|mjs|css|map|png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot)(?:$|[?#])~i', $path)) {
        return 'asset';
    }

    foreach ($deliberatelyRemoved as $removed) {
        if ($normalized === normalize_for_compare((string)$removed)) {
            return 'suppressed';
        }
    }

    return '';
}

/**
 * Assigns a category key to a link from its URL and label.
 *
 * Rules are evaluated top to bottom and the first match wins. Most rules are
 * plain substring tests on the lower-cased URL and label, so the order is part
 * of the behavior.
 *
 * @param string $url   Absolute URL of the link.
 * @param string $label Visible label of the link.
 * @return string Category key understood by category_label().
 */
function categorize_link(string $url, string $label): string
{
    $host = strtolower((string)(parse_url($url, PHP_URL_HOST) ?: ''));
    $path = strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ''));
    $haystack = strtolower($url . ' ' . $label);

    return match (true) {
        str_ends_with($path, '.pdf') => 'pdf',
        str_ends_with($path, '.zip') => 'archive',
        str_ends_with($path, '.json'), str_contains($path, 'numberdatabase') => 'dataset',
        str_ends_with($path, '.py'), str_contains($haystack, 'script') => 'source',
        str_contains($haystack, 'frequency'),
        str_contains($haystack, 'frequential'),
        str_contains($haystack, 'octave'),
        str_contains($haystack, 'calculator'),
        str_contains($haystack, 'counter') => 'tool',
        str_contains($haystack, 'bot'), str_contains($host, 'perplexity.ai') => 'bot',
        str_contains($host, 'github') => 'github',
        str_contains($path, 'apples'), str_contains($haystack, 'app') => 'applet',
        str_contains($host, 'prezi') => 'presentation',
        str_contains($host, 'youtube'), str_contains($host, 'notion') => 'media',
        $host !== 'spireason.neocities.org' => 'external',
        default => 'onsite',
    };
}

/**
 * Maps a category key to its display label.
 *
 * @param string $category Category key, e.g. as returned by categorize_link().
 * @return string The display label; unknown keys are turned into a capitalized
 *                phrase with hyphens replaced by spaces.
 */
function category_label(string $category): string
{
    $labels = [
        'pdf' => 'PDF text',
        'archive' => 'Archive',
        'dataset' => 'Dataset',
        'source' => 'Source file',
        'tool' => 'Tool',
        'bot' => 'External bot/system',
        'github' => 'Repository',
        'applet' => 'Applet',
        'presentation' => 'Presentation',
        'media' => 'Media',
        'external' => 'External system',
        'onsite' => 'On-site branch',
    ];

    return $labels[$category] ?? ucfirst(str_replace('-', ' ', $category));
}

/**
 * Classifies a host for the "safety" field of a link.
 *
 * @param string $host Lower-cased host name.
 * @return string 'source' for the scanned site itself, 'known-external' for a
 *                listed host or any of its subdomains, otherwise 'review'.
 */
function safety_class(string $host): string
{
    if ($host === 'spireason.neocities.org') {
        return 'source';
    }

    $knownHosts = [
        'laegna.notaku.site',
        'prezi.com',
        'www.perplexity.ai',
        'huggingface.co',
        'assorted-canopy-961.notion.site',
        'www.youtube.com',
        'youtube.com',
        'github.com',
        'tambetvali.github.io',
    ];

    foreach ($knownHosts as $knownHost) {
        if ($host === $knownHost || str_ends_with($host, '.' . $knownHost)) {
            return 'known-external';
        }
    }

    return 'review';
}

/**
 * Shortens text to at most $limit characters, counting UTF-8 characters.
 *
 * Text longer than the limit is cut to $limit - 1 characters and an ellipsis is
 * appended. Text that is not valid UTF-8 is returned unchanged.
 *
 * @param string $text  Text to shorten.
 * @param int    $limit Maximum number of characters; values below 1 yield ''.
 * @return string The possibly shortened text.
 */
function unicode_limit(string $text, int $limit): string
{
    if ($limit < 1) {
        return '';
    }

    if (preg_match_all('/./us', $text, $chars) && count($chars[0]) > $limit) {
        return implode('', array_slice($chars[0], 0, $limit - 1)) . '…';
    }

    return $text;
}

/**
 * Produces the form of a URL used for duplicate and suppression comparisons.
 *
 * Surrounding whitespace is trimmed and a single trailing '#' (an empty
 * fragment) is removed; everything else is kept as is.
 *
 * @param string $url URL to normalize.
 * @return string The comparison form.
 */
function normalize_for_compare(string $url): string
{
    $url = trim($url);

    return str_ends_with($url, '#') ? substr($url, 0, -1) : $url;
}

/**
 * Resolves an href against a base URL.
 *
 * Hrefs that already carry a scheme are returned as they are (spaces encoded),
 * so that non-http links can be recognised and skipped by the caller.
 * Protocol-relative, root-relative, query-only and path-relative hrefs are
 * resolved against $base, keeping its port; "." and ".." segments of
 * path-relative hrefs are collapsed.
 *
 * @param string $href Href as found in the page.
 * @param string $base URL of the page that contains the href.
 * @return string The absolute URL with spaces percent-encoded.
 */
function absolutize_url(string $href, string $base): string
{
    $href = trim($href);

    // Anything with a scheme is already absolute and must not be glued onto the base.
    if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $href)) {
        return encode_spaces($href);
    }

    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = [];
    }
    $scheme = $parts['scheme'] ?? 'https';
    $host = $parts['host'] ?? 'spireason.neocities.org';
    $origin = $scheme . '://' . $host . (isset($parts['port']) ? ':' . $parts['port'] : '');

    if (str_starts_with($href, '//')) {
        return encode_spaces($scheme . ':' . $href);
    }
    if (str_starts_with($href, '/')) {
        return encode_spaces($origin . $href);
    }

    $basePath = $parts['path'] ?? '/';
    if (!str_starts_with($basePath, '/')) {
        $basePath = '/' . $basePath;
    }

    // Split the href into its path and the trailing query/fragment part.
    $cut = strcspn($href, '?#');
    $relativePath = substr($href, 0, $cut);
    $suffix = substr($href, $cut);

    // An href without a path refers to the base document itself.
    if ($relativePath === '') {
        $keepBaseQuery = !str_starts_with($suffix, '?') && isset($parts['query']);
        $query = $keepBaseQuery ? '?' . $parts['query'] : '';

        return encode_spaces($origin . $basePath . $query . $suffix);
    }

    $dir = preg_replace('~/[^/]*$~', '/', $basePath) ?: '/';

    // Collapse "." and ".." segments; ".." never climbs above the root.
    $segments = explode('/', $dir . $relativePath);
    $last = $segments[count($segments) - 1];
    $resolved = [];
    foreach ($segments as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            if (count($resolved) > 1) {
                array_pop($resolved);
            }
            continue;
        }
        $resolved[] = $segment;
    }
    // A path ending in "." or ".." names a directory, so keep the trailing slash.
    if ($last === '.' || $last === '..') {
        $resolved[] = '';
    }

    return encode_spaces($origin . implode('/', $resolved) . $suffix);
}

/**
 * Percent-encodes literal spaces in a URL.
 *
 * @param string $url URL that may contain spaces.
 * @return string The URL with every space replaced by %20.
 */
function encode_spaces(string $url): string
{
    return str_replace(' ', '%20', $url);
}