40247-vm/api/scan_structure.php
2026-06-12 09:15:36 +00:00

317 lines
13 KiB
PHP

<?php
declare(strict_types=1);
// Structural scan endpoint: fetches the source page, splits it at the known
// section anchors listed below and reports the links found in each section.
// Every response emitted by this script is JSON.
header("Content-Type: application/json; charset=utf-8");
// NOTE(review): presumably defines $sectionLayout and $layoutZones, which are
// read further down in this script — confirm against includes/curated_data.php.
require_once __DIR__ . "/../includes/curated_data.php";
// Page that is fetched and scanned; also passed as the base when resolving links.
$source = "https://spireason.neocities.org/";
// Section owners to look for. "id" is matched against an HTML id="…" attribute
// in the fetched page; "icon" and "title" are copied into the output unchanged.
// The order here does not matter: found entries are later sorted by page position.
$ids = [
["id"=>"growing-out-of-the-shadows", "icon"=>"⚛️", "title"=>"Growing Out of the Shadows"],
["id"=>"sheep", "icon"=>"🐑", "title"=>"Sheep / Laegna Counters"],
["id"=>"infinity", "icon"=>"♾️", "title"=>"Infinity"],
["id"=>"natura", "icon"=>"🌀", "title"=>"Natura"],
["id"=>"sunrise", "icon"=>"🔆", "title"=>"Sunrise"],
["id"=>"bigbang", "icon"=>"💥", "title"=>"Bigbang"],
["id"=>"yggdrasill", "icon"=>"🌳", "title"=>"Yggdrasill"],
["id"=>"spiritrise", "icon"=>"🔷", "title"=>"Spiritrise"],
["id"=>"laelab", "icon"=>"🔬", "title"=>"LaeLab"],
["id"=>"geneticar", "icon"=>"🧬", "title"=>"Geneticar"],
["id"=>"handheldcal", "icon"=>"🖩", "title"=>"HandheldCal"],
["id"=>"puzzled", "icon"=>"🧩", "title"=>"Puzzled"],
["id"=>"chakra", "icon"=>"✴️", "title"=>"Chakra"],
["id"=>"wheelsgoround", "icon"=>"⚙️", "title"=>"Wheels Go Round"],
["id"=>"laemedics", "icon"=>"🧘", "title"=>"LaeMedics"],
// NOTE(review): this is the only entry with an empty icon — possibly a glyph
// lost in an encoding round-trip; confirm the intended value.
["id"=>"coffeeandcigarettes", "icon"=>"", "title"=>"Coffee and Cigarettes"],
];
// Download the page once; every later step works on this in-memory string.
$html = fetch_source($source);
if ($html === null) {
    // No usable body from upstream: report it as a gateway failure and stop.
    http_response_code(502);
    echo json_encode(["success"=>false, "error"=>"Could not fetch source page"], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
    exit;
}

// Find the byte offset of each known section anchor (its id="…" attribute).
// Entries whose id does not occur in the page are silently left out.
$positions = [];
foreach ($ids as $item) {
    // The look-behind replaces a plain \b so that attributes merely ending in
    // "id" (data-id="sheep", aria-id='…') cannot be mistaken for the real id
    // attribute; the backreference requires the closing quote to match the
    // opening one.
    $pattern = '~(?<![\w:.-])id\s*=\s*(["\x27])' . preg_quote($item["id"], '~') . '\1~i';
    if (preg_match($pattern, $html, $m, PREG_OFFSET_CAPTURE)) {
        $item["pos"] = $m[0][1];
        $positions[] = $item;
    }
}
// Document order decides which section "owns" the markup that follows it.
usort($positions, fn($a, $b) => $a["pos"] <=> $b["pos"]);
// Walk the anchors in document order. Each section spans from its own anchor
// up to the next anchor (or the end of the page for the last one).
$sections = [];
$totalLinks = 0;
$utilityChrome = [];
foreach ($positions as $idx => $item) {
    $following = $positions[$idx + 1] ?? null;
    $from = (int)$item["pos"];
    $to = $following !== null ? (int)$following["pos"] : strlen($html);
    $chunk = substr($html, $from, $to - $from);

    $isCoffee = $item["id"] === "coffeeandcigarettes";
    $visibleLinks = parse_visible_links($chunk, $source);
    if ($isCoffee) {
        // Coffee is terminal prose. Links after it are footer/fixed/absolute utility chrome,
        // not branch links owned by the Coffee icon.
        $utilityChrome = $visibleLinks;
        $ownedLinks = [];
    } else {
        $ownedLinks = $visibleLinks;
        $totalLinks += count($ownedLinks);
    }

    // Describe where this section's ownership stops.
    if ($isCoffee) {
        $ownsUntil = "utility chrome";
    } elseif ($following !== null) {
        $ownsUntil = "#" . $following["id"];
    } else {
        $ownsUntil = "end";
    }

    $sections[] = [
        "id" => $item["id"],
        "icon" => $item["icon"],
        "title" => $item["title"],
        "source_anchor" => "#" . $item["id"],
        "owns_until" => $ownsUntil,
        "layout" => $sectionLayout[$item["id"]] ?? null,
        "excerpt" => unicode_limit(clean_text($chunk), 420),
        "link_count" => count($ownedLinks),
        "links" => $ownedLinks,
    ];
}
// Fixed extra entries that are reported alongside the scanned sections.
$shadowBonus = [
    ["icon"=>"🎭", "title"=>"Dancing Shadows", "href"=>"https://material-psychic-gam-8mo3.bolt.host/"],
    ["icon"=>"📘", "title"=>"Book of Shadows 2", "href"=>"https://app-bxfrqbbqegap.appmedo.com/"],
    ["icon"=>"📕", "title"=>"Bulk Load Book of the Dead 3", "href"=>"https://app-by9gm7mu9ssh.appmedo.com/"],
];

// The payload contains text scraped from a remote page. Without
// JSON_INVALID_UTF8_SUBSTITUTE a single malformed byte sequence makes
// json_encode() return false, which `echo` would turn into an empty 200 reply.
$payload = json_encode([
    "success" => true,
    "source" => $source,
    "scanned_at" => gmdate("Y-m-d H:i:s") . " UTC",
    "model" => "CSS-aware: first 100vh console and fixed/absolute utility chrome are separated; document-flow icon owners keep links until the next meaningful icon/title; Coffee is terminal prose and owns no branch links.",
    "layout_zones" => $layoutZones ?? [],
    "section_count" => count($sections),
    "link_count" => $totalLinks,
    "utility_count" => count($utilityChrome),
    "utility_chrome" => $utilityChrome,
    "shadow_bonus" => $shadowBonus,
    "sections" => $sections,
], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE);

if ($payload === false) {
    // Any remaining encoding failure is reported explicitly instead of
    // answering with an empty success response.
    http_response_code(500);
    echo json_encode(["success"=>false, "error"=>"Could not encode scan result"], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
} else {
    echo $payload;
}
/**
 * Read the body behind a URL using PHP's stream wrappers.
 *
 * HTTP requests are sent with an 18 second timeout and a fixed User-Agent.
 * Warnings from the read are suppressed; failure is signalled by the return value.
 *
 * @param string $url Location to read.
 * @return string|null The body, or null when the read failed or returned an empty string.
 */
function fetch_source(string $url): ?string
{
    $httpOptions = [
        "timeout" => 18,
        "header" => "User-Agent: LandScaper structural scanner\r\n",
    ];
    $body = @file_get_contents($url, false, stream_context_create(["http" => $httpOptions]));
    if (!is_string($body) || $body === "") {
        return null;
    }
    return $body;
}
/**
 * Collect the visible, followable links of an HTML fragment.
 *
 * An anchor is dropped when it is hidden, has no href, has no visible label,
 * or when skip_reason() reports a reason for its target. Every kept anchor is
 * described by its label (at most 150 characters), resolved URL, lowercase
 * host, category, surrounding-markup hints and safety class.
 *
 * @param string $chunk HTML fragment to scan.
 * @param string $base  URL against which relative hrefs are resolved.
 * @return list<array{text:string,href:string,host:string,category:string,category_label:string,context:string,layout_hint:string,safety:string}>
 */
function parse_visible_links(string $chunk, string $base): array
{
    $anchorCount = preg_match_all("~<a\b([^>]*)>(.*?)</a>~is", $chunk, $anchors, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
    if (!$anchorCount) {
        return [];
    }

    $collected = [];
    foreach ($anchors as $anchor) {
        $attrs = $anchor[1][0] ?? "";
        if (is_hidden_anchor($attrs)) {
            continue;
        }

        $href = attr_value($attrs, "href");
        $label = visible_label($attrs, $anchor[2][0] ?? "");
        if ($href === "" || $label === "") {
            continue;
        }

        $absolute = absolute_url($href, $base);
        if (skip_reason($href, $absolute) !== "") {
            continue;
        }

        // Byte offset of the whole <a …>…</a> match inside $chunk; the hint
        // helpers look at the markup preceding this point.
        $position = (int)($anchor[0][1] ?? 0);
        $host = strtolower((string)(parse_url($absolute, PHP_URL_HOST) ?: ""));
        $category = categorize_link($absolute, $label);

        $collected[] = [
            "text" => unicode_limit($label, 150),
            "href" => $absolute,
            "host" => $host,
            "category" => $category,
            "category_label" => category_label($category),
            "context" => context_hint($chunk, $position),
            "layout_hint" => layout_hint($attrs, $chunk, $position),
            "safety" => safety_class($host),
        ];
    }
    return $collected;
}
/**
 * Name the nearest block-level opening tag that precedes a position.
 *
 * Only the 1800 bytes before $offset are inspected. The last opening
 * article/section/blockquote/div/p/big/center tag in that window is rendered
 * in selector form: tag name, then "#id" and ".class.names" when present.
 * The tag is not checked for having been closed again before $offset.
 *
 * @param string $chunk  HTML fragment.
 * @param int    $offset Byte offset inside $chunk.
 * @return string Selector-like label, or 'flow text' when no such tag precedes the offset.
 */
function context_hint(string $chunk, int $offset): string
{
    $windowStart = max(0, $offset - 1800);
    $window = substr($chunk, $windowStart, min(1800, $offset));

    $openerCount = preg_match_all('~<(article|section|blockquote|div|p|big|center)\b([^>]*)>~i', $window, $openers, PREG_SET_ORDER);
    if (!$openerCount) {
        return 'flow text';
    }

    $nearest = $openers[count($openers) - 1];
    $tagAttrs = $nearest[2] ?? '';
    $id = attr_value($tagAttrs, 'id');
    $class = attr_value($tagAttrs, 'class');

    $idPart = $id !== '' ? '#' . $id : '';
    $classPart = $class !== '' ? '.' . preg_replace('/\s+/', '.', trim($class)) : '';
    return strtolower($nearest[1] ?? 'flow') . $idPart . $classPart;
}
/**
 * Guess how an anchor is laid out from its own attributes and nearby markup.
 *
 * The anchor's style and class values plus the 1300 bytes of markup before it
 * are lowercased and searched for marker substrings. Rules are tried in a
 * fixed order and the first hit wins, so positioning markers take precedence
 * over the structural tag markers.
 *
 * @param string $attrs  Raw attribute text of the anchor tag.
 * @param string $chunk  HTML fragment containing the anchor.
 * @param int    $offset Byte offset of the anchor inside $chunk.
 * @return string One of the fixed hint strings; 'document-flow link' when nothing matches.
 */
function layout_hint(string $attrs, string $chunk, int $offset): string
{
    $before = substr($chunk, max(0, $offset - 1300), min(1300, $offset));
    $haystack = strtolower(attr_value($attrs, 'style'))
        . ' ' . strtolower(attr_value($attrs, 'class'))
        . ' ' . strtolower($before);

    // Ordered rule table: [hint, substrings that trigger it].
    $rules = [
        ['fixed chrome', ['position: fixed', 'position:fixed']],
        ['absolute-positioned chrome', ['position: absolute', 'position:absolute', 'calc(100vh']],
        ['relative positioned block', ['position: relative', 'position:relative']],
        ['article branch link', ['<article']],
        ['section branch link', ['<section']],
        ['blockquote branch link', ['<blockquote']],
    ];
    foreach ($rules as [$hint, $markers]) {
        foreach ($markers as $marker) {
            if (str_contains($haystack, $marker)) {
                return $hint;
            }
        }
    }
    return 'document-flow link';
}
/**
 * Read one attribute value out of the raw attribute text of an HTML tag.
 *
 * The text is tokenised attribute by attribute, so the requested name has to
 * match a whole attribute name: asking for "id" does not pick up `data-id`,
 * and text inside another attribute's quoted value is never mistaken for an
 * attribute. Double-quoted, single-quoted and unquoted values are supported.
 * When a name occurs more than once, the first occurrence wins.
 *
 * @param string $attrs Attribute portion of a tag, e.g. ` href="a.html" class="x"`.
 * @param string $name  Attribute name, compared case-insensitively.
 * @return string Trimmed, entity-decoded value; "" when the attribute is absent or has no value.
 */
function attr_value(string $attrs, string $name): string
{
    // 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
    $tokenizer = '~([^\s"\x27<>/=]+)(?:\s*=\s*(?:"([^"]*)"|\x27([^\x27]*)\x27|([^\s"\x27=<>`]+)))?~';
    if (!preg_match_all($tokenizer, $attrs, $found, PREG_SET_ORDER)) {
        return "";
    }
    foreach ($found as $attr) {
        if (strcasecmp($attr[1], $name) !== 0) {
            continue;
        }
        // At most one of the three value groups is non-empty for a given match.
        $raw = ($attr[2] ?? "") . ($attr[3] ?? "") . ($attr[4] ?? "");
        return html_entity_decode(trim($raw), ENT_QUOTES | ENT_HTML5, "UTF-8");
    }
    return "";
}
/**
 * Decide whether an anchor is hidden according to its own attributes.
 *
 * An anchor counts as hidden when it carries the `hidden` attribute,
 * `aria-hidden="true"`, or an inline style declaring `display: none`,
 * `visibility: hidden` or an opacity of exactly zero. Whitespace around the
 * colon is tolerated, and partial opacities such as `opacity: 0.8` are not
 * treated as hidden. Stylesheets and ancestor elements are not considered.
 *
 * @param string $attrs Raw attribute text of the anchor tag.
 * @return bool True when the anchor should be treated as invisible.
 */
function is_hidden_anchor(string $attrs): bool
{
    if (preg_match('~(?:^|\s)hidden(?:\s|=|$)~i', $attrs)) {
        return true;
    }
    if (preg_match('~\baria-hidden\s*=\s*(["\x27])true\1~i', $attrs)) {
        return true;
    }

    $style = strtolower(attr_value($attrs, "style"));
    if ($style === "") {
        return false;
    }

    // The look-behind keeps longer property names (e.g. fill-opacity) from
    // matching. The opacity rule accepts 0, 0.0, .0 and 0% but rejects any
    // value that continues with further digits.
    $hidingDeclarations = [
        '~(?<![\w-])display\s*:\s*none\b~',
        '~(?<![\w-])visibility\s*:\s*hidden\b~',
        '~(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)(?![\d.])~',
    ];
    foreach ($hidingDeclarations as $declaration) {
        if (preg_match($declaration, $style)) {
            return true;
        }
    }
    return false;
}
/**
 * Work out the human-readable label of an anchor.
 *
 * The label is the anchor's inner text with script/style elements and all
 * tags removed, entities decoded and whitespace collapsed. When that is
 * empty, the anchor's aria-label, title or alt attribute is used instead,
 * in that order.
 *
 * @param string $attrs Raw attribute text of the anchor tag.
 * @param string $inner Markup between <a …> and </a>.
 * @return string The label, or "" when none could be determined.
 */
function visible_label(string $attrs, string $inner): string
{
    // The pattern must be single-quoted: inside a double-quoted PHP string
    // "\1" is an octal escape (byte 0x01) rather than a regex backreference,
    // which would leave script/style bodies in the label.
    $cleanInner = preg_replace('~<(script|style)\b[^>]*>.*?</\1>~is', " ", $inner) ?? $inner;
    $text = html_entity_decode(strip_tags($cleanInner), ENT_QUOTES | ENT_HTML5, "UTF-8");
    $text = trim(preg_replace("/\s+/u", " ", $text) ?? "");
    if ($text !== "") {
        return $text;
    }

    // No visible text (e.g. an image-only link): fall back to descriptive attributes.
    foreach (["aria-label", "title", "alt"] as $attr) {
        $candidate = attr_value($attrs, $attr);
        if ($candidate !== "") {
            return trim(preg_replace("/\s+/u", " ", $candidate) ?? "");
        }
    }
    return "";
}
/**
 * Explain why a link should be left out of the scan, if it should.
 *
 * @param string $href     The href as written in the markup.
 * @param string $absolute The same link after resolution to an absolute URL.
 * @return string "fragment" for empty or same-page (#…) hrefs, "scheme" for
 *                non-navigational or non-http(s) targets, "asset" for static
 *                resource file extensions, or "" when the link is kept.
 */
function skip_reason(string $href, string $absolute): string
{
    $normalized = strtolower(trim($href));
    if ($normalized === "" || $normalized[0] === "#") {
        return "fragment";
    }

    $blockedSchemes = ["javascript:", "mailto:", "tel:", "data:", "blob:"];
    $usesBlockedScheme = false;
    foreach ($blockedSchemes as $prefix) {
        if (str_starts_with($normalized, $prefix)) {
            $usesBlockedScheme = true;
            break;
        }
    }
    if ($usesBlockedScheme || !preg_match("~^https?://~i", $absolute)) {
        return "scheme";
    }

    // Only the path is inspected, so a query string cannot fake or hide an extension.
    $path = (string)(parse_url($absolute, PHP_URL_PATH) ?: "");
    $isAsset = preg_match("~\.(?:js|mjs|css|map|png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot)(?:$|[?#])~i", $path);
    return $isAsset ? "asset" : "";
}
/**
 * Resolve an href found in the page to an absolute URL.
 *
 * Handles absolute http(s) URLs, protocol-relative URLs (resolved as https),
 * root-relative paths, query-only references and paths relative to the
 * directory of the base URL. A base path ending in "/" is treated as a
 * directory, and a port in the base URL is preserved. Hrefs that carry any
 * other scheme (mailto:, ftp:, …) are returned unchanged so that callers can
 * recognise and reject them. "." and ".." segments are not collapsed.
 *
 * @param string $href Link target as written in the markup (entities allowed).
 * @param string $base URL of the document containing the link.
 * @return string The resolved URL.
 */
function absolute_url(string $href, string $base): string
{
    $href = html_entity_decode(trim($href), ENT_QUOTES | ENT_HTML5, "UTF-8");
    if (preg_match("~^https?://~i", $href)) {
        return $href;
    }
    if (str_starts_with($href, "//")) {
        return "https:" . $href;
    }
    // Any other explicit scheme is not relative to the base; gluing it onto
    // the host would manufacture a bogus http(s) URL.
    if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $href)) {
        return $href;
    }

    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = [];
    }
    $scheme = $parts["scheme"] ?? "https";
    $host = $parts["host"] ?? "spireason.neocities.org";
    $origin = $scheme . "://" . $host . (isset($parts["port"]) ? ":" . $parts["port"] : "");
    if (str_starts_with($href, "/")) {
        return $origin . $href;
    }

    $basePath = $parts["path"] ?? "/";
    if (!str_starts_with($basePath, "/")) {
        $basePath = "/" . $basePath;
    }
    $encoded = str_replace(" ", "%20", $href);
    if (str_starts_with($href, "?")) {
        // A query-only reference keeps the base document's full path.
        return $origin . $basePath . $encoded;
    }

    // Directory of the base document: everything up to and including the last
    // "/". dirname() is avoided because dirname("/foo/") yields "/", which
    // would drop the directory of a directory-style base URL.
    $dir = substr($basePath, 0, (int)strrpos($basePath, "/") + 1);
    return $origin . $dir . $encoded;
}
/**
 * Reduce an HTML fragment to a single line of readable text.
 *
 * Script and style elements are removed together with their contents, the
 * remaining tags are stripped, entities are decoded and whitespace is
 * collapsed. Further passes remove a leftover `id="…">` tag tail, CSS rule
 * blocks, CSS-looking declarations, CSS comments and empty @media blocks.
 *
 * NOTE(review): the declaration pass removes any run that starts with a CSS
 * property word (e.g. "color", "width", "position") and ends at the next "."
 * or ";", so ordinary sentences containing those words are removed as well.
 *
 * @param string $html HTML fragment; it may begin or end in the middle of a tag.
 * @return string Plain text; "" when the final whitespace pass fails (e.g. on invalid UTF-8).
 */
function clean_text(string $html): string
{
    // The pattern must be single-quoted: inside a double-quoted PHP string
    // "\1" is an octal escape (byte 0x01) rather than a regex backreference,
    // which would leave script/style bodies in the text.
    $html = preg_replace('~<(script|style)\b[^>]*>.*?</\1>~is', " ", $html) ?? $html;
    $text = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, "UTF-8");
    // Tail of the tag the fragment started inside of.
    $text = preg_replace("~\bid=\"[^\"]+\"\s*>~u", " ", $text) ?? $text;
    // CSS rule blocks: selector { declarations }.
    $text = preg_replace("~[#.][A-Za-z0-9_-][^{}]{0,220}\\{[^{}]*\\}~u", " ", $text) ?? $text;
    // Loose CSS declarations (see the review note in the docblock).
    $text = preg_replace("~\b(?:font|color|background|padding|margin|position|display|border|box-shadow|width|height|transform|opacity|z-index|transition)[^.;]{0,180}[.;]~iu", " ", $text) ?? $text;
    // CSS comments.
    $text = preg_replace("~/\\*.*?\\*/~s", " ", $text) ?? $text;
    // @media wrappers left empty by the passes above.
    $text = preg_replace("~@media[^{}]*\\{\\s*\\}~u", " ", $text) ?? $text;
    return trim(preg_replace("/\s+/u", " ", $text) ?? "");
}
/**
 * Assign a link to one of the scanner's categories.
 *
 * Rules are evaluated top to bottom and the first match wins. Keyword rules
 * use plain substring tests on the lowercased "URL + label" text.
 *
 * NOTE(review): the "github" arm can never be selected — any URL whose host
 * contains "github" also contains it in the URL text and is therefore already
 * classified as "source" by an earlier arm. Confirm which category is intended.
 *
 * @param string $url   Absolute URL of the link.
 * @param string $label Visible label of the link.
 * @return string Category key such as "pdf", "tool", "applet", "external" or "onsite".
 */
function categorize_link(string $url, string $label): string
{
    $host = strtolower((string)(parse_url($url, PHP_URL_HOST) ?: ""));
    $path = strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ""));
    $haystack = strtolower($url . " " . $label);

    // True when at least one of the needles occurs in $subject.
    $containsAny = static function (string $subject, array $needles): bool {
        foreach ($needles as $needle) {
            if (str_contains($subject, $needle)) {
                return true;
            }
        }
        return false;
    };

    return match (true) {
        str_ends_with($path, ".pdf") => "pdf",
        str_ends_with($path, ".zip") => "archive",
        str_ends_with($path, ".json"), str_contains($path, "numberdatabase") => "dataset",
        str_ends_with($path, ".py"), $containsAny($haystack, ["script", "github"]) => "source",
        $containsAny($haystack, ["frequency", "frequential", "octave", "calculator", "counter"]) => "tool",
        str_contains($haystack, "bot"), str_contains($host, "perplexity.ai") => "bot",
        str_contains($host, "github") => "github",
        str_contains($path, "apples"),
        str_contains($haystack, "app"),
        $containsAny($host, ["lovable.app", "bolt.host", "appmedo.com"]) => "applet",
        $containsAny($host, ["prezi", "docs.google"]) => "presentation",
        $containsAny($host, ["youtube", "notion"]) => "media",
        $host !== "spireason.neocities.org" => "external",
        default => "onsite",
    };
}
/**
 * Translate a category key into its display label.
 *
 * @param string $category Category key as produced by categorize_link().
 * @return string The fixed label for known keys; for unknown keys the key
 *                itself with hyphens turned into spaces and the first letter
 *                upper-cased.
 */
function category_label(string $category): string
{
    return match ($category) {
        "pdf" => "PDF text",
        "archive" => "Archive",
        "dataset" => "Dataset",
        "source" => "Source file",
        "tool" => "Tool",
        "bot" => "External bot/system",
        "github" => "Repository",
        "applet" => "Applet",
        "presentation" => "Presentation",
        "media" => "Media",
        "external" => "External system",
        "onsite" => "On-site branch",
        default => ucfirst(str_replace("-", " ", $category)),
    };
}
/**
 * Classify a host by how much the scanner trusts it.
 *
 * @param string $host Lowercase host name of a link.
 * @return string "source" for the scanned site itself, "known-external" for a
 *                host on the allow-list or any subdomain of one, "review" otherwise.
 */
function safety_class(string $host): string
{
    if ($host === "spireason.neocities.org") {
        return "source";
    }

    $allowList = [
        "laegna.notaku.site",
        "prezi.com",
        "www.perplexity.ai",
        "huggingface.co",
        "assorted-canopy-961.notion.site",
        "www.youtube.com",
        "youtube.com",
        "github.com",
        "tambetvali.github.io",
        "archive.org",
    ];

    if (in_array($host, $allowList, true)) {
        return "known-external";
    }
    // Subdomains count too; the leading dot prevents look-alike hosts such as
    // "evil-github.com" from matching "github.com".
    foreach ($allowList as $trusted) {
        if (str_ends_with($host, "." . $trusted)) {
            return "known-external";
        }
    }
    return "review";
}
/**
 * Shorten a UTF-8 string to at most $limit characters.
 *
 * Text that fits is returned unchanged. Longer text is cut to $limit - 1
 * characters and an ellipsis is appended, so the result is exactly $limit
 * characters long. Counting is per code point, not per byte.
 *
 * @param string $text  UTF-8 text.
 * @param int    $limit Maximum number of characters in the result.
 * @return string The possibly shortened text; "" when $limit is zero or
 *                negative. Text that is not valid UTF-8 cannot be split and
 *                is returned unchanged.
 */
function unicode_limit(string $text, int $limit): string
{
    if ($limit <= 0) {
        return "";
    }
    if (!preg_match_all("/./us", $text, $chars) || count($chars[0]) <= $limit) {
        return $text;
    }
    // One slot is reserved for the ellipsis. It is written as an escape so the
    // character survives tools that strip non-ASCII bytes from source files.
    return implode("", array_slice($chars[0], 0, $limit - 1)) . "\u{2026}";
}