317 lines
13 KiB
PHP
317 lines
13 KiB
PHP
<?php
|
|
declare(strict_types=1);
|
|
header("Content-Type: application/json; charset=utf-8");
|
|
|
|
require_once __DIR__ . "/../includes/curated_data.php";
|
|
|
|
// Page that gets downloaded and scanned; relative links are resolved against it.
$source = "https://spireason.neocities.org/";

// Anchors searched for in the page markup, as [id, icon, title] tuples that are
// expanded into the associative rows the rest of the script reads.
$ids = array_map(
    static fn (array $row): array => ["id" => $row[0], "icon" => $row[1], "title" => $row[2]],
    [
        ["growing-out-of-the-shadows", "⚛️", "Growing Out of the Shadows"],
        ["sheep", "🐑", "Sheep / Laegna Counters"],
        ["infinity", "♾️", "Infinity"],
        ["natura", "🌀", "Natura"],
        ["sunrise", "🔆", "Sunrise"],
        ["bigbang", "💥", "Bigbang"],
        ["yggdrasill", "🌳", "Yggdrasill"],
        ["spiritrise", "🔷", "Spiritrise"],
        ["laelab", "🔬", "LaeLab"],
        ["geneticar", "🧬", "Geneticar"],
        ["handheldcal", "🖩", "HandheldCal"],
        ["puzzled", "🧩", "Puzzled"],
        ["chakra", "✴️", "Chakra"],
        ["wheelsgoround", "⚙️", "Wheels Go Round"],
        ["laemedics", "🧘", "LaeMedics"],
        ["coffeeandcigarettes", "☕", "Coffee and Cigarettes"],
    ]
);
|
|
|
|
// Download the page. Nothing can be scanned without it, so report 502 and stop.
$html = fetch_source($source);
if ($html === null) {
    http_response_code(502);
    echo json_encode(
        ["success" => false, "error" => "Could not fetch source page"],
        JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
    );
    exit;
}

// Record the byte offset of the first id="…" attribute for every known section.
// Sections whose id is not present in the markup are silently left out.
$positions = [];
foreach ($ids as $item) {
    // The look-behind requires a real attribute-name start: a plain \b would also
    // match the "id" inside data-id="…" (or x:id="…") and anchor the section at
    // the wrong offset.
    $pattern = '~(?<![\w:.-])id\s*=\s*["\x27]' . preg_quote($item["id"], '~') . '["\x27]~i';
    if (preg_match($pattern, $html, $m, PREG_OFFSET_CAPTURE)) {
        $item["pos"] = $m[0][1];
        $positions[] = $item;
    }
}

// Order sections by where they appear in the document, not by the order of $ids.
usort($positions, fn($a, $b) => $a["pos"] <=> $b["pos"]);
|
|
|
|
// Cut the page into one chunk per located anchor (from that anchor up to the next
// one, or to the end of the document) and collect the visible links in each chunk.
$sections = [];
$totalLinks = 0;
$utilityChrome = [];

foreach ($positions as $idx => $item) {
    $next = $positions[$idx + 1] ?? null;
    $start = (int)$item["pos"];
    $end = $next !== null ? (int)$next["pos"] : strlen($html);
    $chunk = substr($html, $start, $end - $start);

    $isTerminal = $item["id"] === "coffeeandcigarettes";
    $chunkLinks = parse_visible_links($chunk, $source);

    if ($isTerminal) {
        // Coffee is terminal prose. Links after it are footer/fixed/absolute utility chrome,
        // not branch links owned by the Coffee icon.
        $utilityChrome = $chunkLinks;
        $ownedLinks = [];
    } else {
        $ownedLinks = $chunkLinks;
        $totalLinks += count($ownedLinks);
    }

    if ($isTerminal) {
        $ownsUntil = "utility chrome";
    } elseif ($next !== null) {
        $ownsUntil = "#" . $next["id"];
    } else {
        $ownsUntil = "end";
    }

    // $sectionLayout is not defined in this file; presumably it comes from
    // curated_data.php. A missing entry is reported as null.
    $layout = $sectionLayout[$item["id"]] ?? null;

    $sections[] = [
        "id" => $item["id"],
        "icon" => $item["icon"],
        "title" => $item["title"],
        "source_anchor" => "#" . $item["id"],
        "owns_until" => $ownsUntil,
        "layout" => $layout,
        "excerpt" => unicode_limit(clean_text($chunk), 420),
        "link_count" => count($ownedLinks),
        "links" => $ownedLinks,
    ];
}
|
|
|
|
// Fixed extra entries that are appended to every response as-is.
$shadowBonus = [
    ["icon" => "🎭", "title" => "Dancing Shadows", "href" => "https://material-psychic-gam-8mo3.bolt.host/"],
    ["icon" => "📘", "title" => "Book of Shadows 2", "href" => "https://app-bxfrqbbqegap.appmedo.com/"],
    ["icon" => "📕", "title" => "Bulk Load Book of the Dead 3", "href" => "https://app-by9gm7mu9ssh.appmedo.com/"],
];

// Hrefs, class names and labels are copied out of remote markup and may contain
// malformed UTF-8. Without JSON_INVALID_UTF8_SUBSTITUTE json_encode() would return
// false and the endpoint would answer 200 with an empty body.
$jsonFlags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE;

$payload = json_encode([
    "success" => true,
    "source" => $source,
    "scanned_at" => gmdate("Y-m-d H:i:s") . " UTC",
    "model" => "CSS-aware: first 100vh console and fixed/absolute utility chrome are separated; document-flow icon owners keep links until the next meaningful icon/title; Coffee is terminal prose and owns no branch links.",
    // $layoutZones is not defined in this file; presumably it comes from curated_data.php.
    "layout_zones" => $layoutZones ?? [],
    "section_count" => count($sections),
    "link_count" => $totalLinks,
    "utility_count" => count($utilityChrome),
    "utility_chrome" => $utilityChrome,
    "shadow_bonus" => $shadowBonus,
    "sections" => $sections,
], $jsonFlags);

if ($payload === false) {
    // Any remaining encoding failure is reported instead of sending an empty success.
    http_response_code(500);
    $payload = json_encode(
        ["success" => false, "error" => "Could not encode scan result: " . json_last_error_msg()],
        $jsonFlags
    );
}

echo $payload;
|
|
|
|
/**
 * Download a URL and return its body.
 *
 * The request uses an 18 second timeout and a fixed scanner User-Agent.
 * Warnings raised by the download are suppressed; failure is signalled only
 * through the return value.
 *
 * @param string $url Address to download.
 * @return string|null The body, or null when the download failed or was empty.
 */
function fetch_source(string $url): ?string
{
    $httpOptions = [
        "timeout" => 18,
        "header" => "User-Agent: LandScaper structural scanner\r\n",
    ];

    $body = @file_get_contents($url, false, stream_context_create(["http" => $httpOptions]));

    if (!is_string($body) || $body === "") {
        return null;
    }

    return $body;
}
|
|
|
|
/**
 * Extract the visible, followable links from a fragment of HTML.
 *
 * An anchor is dropped when it is hidden, has no href, has no visible label,
 * or when skip_reason() gives a reason to ignore its target.
 *
 * @param string $chunk HTML fragment to scan.
 * @param string $base  URL used to resolve relative hrefs.
 * @return array List of rows with the keys text, href, host, category,
 *               category_label, context, layout_hint and safety.
 */
function parse_visible_links(string $chunk, string $base): array
{
    $found = preg_match_all(
        "~<a\b([^>]*)>(.*?)</a>~is",
        $chunk,
        $anchors,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );
    if (!$found) {
        return [];
    }

    $result = [];
    foreach ($anchors as $anchor) {
        // With PREG_OFFSET_CAPTURE each group is a [text, byte offset] pair.
        $attrs = $anchor[1][0] ?? "";
        $inner = $anchor[2][0] ?? "";
        $offset = (int)($anchor[0][1] ?? 0);

        if (is_hidden_anchor($attrs)) {
            continue;
        }

        $href = attr_value($attrs, "href");
        $label = visible_label($attrs, $inner);
        if ($href === "" || $label === "") {
            continue;
        }

        $absolute = absolute_url($href, $base);
        if (skip_reason($href, $absolute) !== "") {
            continue;
        }

        $host = strtolower((string)(parse_url($absolute, PHP_URL_HOST) ?: ""));
        $category = categorize_link($absolute, $label);

        $result[] = [
            "text" => unicode_limit($label, 150),
            "href" => $absolute,
            "host" => $host,
            "category" => $category,
            "category_label" => category_label($category),
            "context" => context_hint($chunk, $offset),
            "layout_hint" => layout_hint($attrs, $chunk, $offset),
            "safety" => safety_class($host),
        ];
    }

    return $result;
}
|
|
|
|
|
|
/**
 * Describe the container tag that most recently opened before a link.
 *
 * Only the (up to) 1800 bytes preceding $offset are inspected. The last opening
 * article/section/blockquote/div/p/big/center tag in that window is reported as
 * "tag#id.class1.class2"; whether that tag has already been closed is not checked.
 *
 * @param string $chunk  HTML fragment containing the link.
 * @param int    $offset Byte offset of the link inside $chunk.
 * @return string The tag description, or 'flow text' when no such tag precedes the link.
 */
function context_hint(string $chunk, int $offset): string
{
    $windowStart = max(0, $offset - 1800);
    $window = substr($chunk, $windowStart, min(1800, $offset));

    $count = preg_match_all(
        '~<(article|section|blockquote|div|p|big|center)\b([^>]*)>~i',
        $window,
        $openers,
        PREG_SET_ORDER
    );
    if (!$count) {
        return 'flow text';
    }

    $nearest = $openers[$count - 1];
    $tagAttrs = $nearest[2] ?? '';
    $description = strtolower($nearest[1] ?? 'flow');

    $elementId = attr_value($tagAttrs, 'id');
    if ($elementId !== '') {
        $description .= '#' . $elementId;
    }

    $classes = attr_value($tagAttrs, 'class');
    if ($classes !== '') {
        // Several class names are joined CSS-selector style: "a b" becomes ".a.b".
        $description .= '.' . preg_replace('/\s+/', '.', trim($classes));
    }

    return $description;
}
|
|
|
|
/**
 * Guess how a link is laid out from textual hints around it.
 *
 * The link's own style and class attributes plus the (up to) 1300 bytes of markup
 * before it are lower-cased and searched for the marker strings below; the first
 * rule with a hit wins.
 *
 * @param string $attrs  Raw attribute string of the anchor tag.
 * @param string $chunk  HTML fragment containing the link.
 * @param int    $offset Byte offset of the link inside $chunk.
 * @return string One of the rule labels, or 'document-flow link' when nothing matches.
 */
function layout_hint(string $attrs, string $chunk, int $offset): string
{
    $preceding = strtolower(substr($chunk, max(0, $offset - 1300), min(1300, $offset)));
    $haystack = strtolower(attr_value($attrs, 'style'))
        . ' ' . strtolower(attr_value($attrs, 'class'))
        . ' ' . $preceding;

    // Order matters: earlier rules take precedence over later ones.
    $rules = [
        'fixed chrome' => ['position: fixed', 'position:fixed'],
        'absolute-positioned chrome' => ['position: absolute', 'position:absolute', 'calc(100vh'],
        'relative positioned block' => ['position: relative', 'position:relative'],
        'article branch link' => ['<article'],
        'section branch link' => ['<section'],
        'blockquote branch link' => ['<blockquote'],
    ];

    foreach ($rules as $label => $markers) {
        foreach ($markers as $marker) {
            if (str_contains($haystack, $marker)) {
                return $label;
            }
        }
    }

    return 'document-flow link';
}
|
|
|
|
/**
 * Read one attribute value out of a raw HTML attribute string.
 *
 * Quoted ("…" or '…') and unquoted values are supported. The value is trimmed
 * and HTML entities are decoded. The first match wins.
 *
 * Limitation: the search is textual, so a `name=` sequence inside another
 * attribute's quoted value can still be picked up.
 *
 * @param string $attrs Attribute portion of a tag, e.g. `href="/x" class="y"`.
 * @param string $name  Attribute name to look up (matched case-insensitively).
 * @return string The decoded value, or "" when the attribute is absent.
 */
function attr_value(string $attrs, string $name): string
{
    // The look-behind demands a real attribute-name start. A plain \b treats "-"
    // as a boundary, so "href" used to match inside data-href="…" and "id" inside
    // data-id="…".
    $pattern = '~(?<![\w:.-])' . preg_quote($name, '~')
        . '\s*=\s*(?:(["\x27])(.*?)\1|([^\s"\x27=<>`]+))~is';

    if (!preg_match($pattern, $attrs, $m)) {
        return "";
    }

    // Group 3 is only present for an unquoted value; group 2 holds a quoted one.
    $raw = ($m[3] ?? "") !== "" ? $m[3] : ($m[2] ?? "");

    return html_entity_decode(trim($raw), ENT_QUOTES | ENT_HTML5, "UTF-8");
}
|
|
|
|
/**
 * Decide whether an anchor is hidden from the reader.
 *
 * An anchor counts as hidden when it carries the boolean `hidden` attribute,
 * `aria-hidden="true"`, or an inline style with `display: none`,
 * `visibility: hidden` or an opacity of exactly zero.
 *
 * @param string $attrs Raw attribute string of the anchor tag.
 * @return bool True when the anchor should be treated as invisible.
 */
function is_hidden_anchor(string $attrs): bool
{
    // Blank out quoted values first, so that a word inside one of them
    // (title="the hidden gems") is not mistaken for the bare `hidden` attribute.
    $withoutValues = preg_replace('~(["\x27]).*?\1~s', '""', $attrs) ?? $attrs;
    if (preg_match('~(?:^|\s)hidden(?:\s|=|/|$)~i', $withoutValues)) {
        return true;
    }

    if (preg_match('~\baria-hidden\s*=\s*(["\x27])true\1~i', $attrs)) {
        return true;
    }

    $style = attr_value($attrs, "style");
    if ($style === "") {
        return false;
    }

    if (preg_match('~display\s*:\s*none|visibility\s*:\s*hidden~i', $style)) {
        return true;
    }

    // Only a value of zero (0, 0.0, .0, 0%) hides the link: a substring test for
    // "opacity:0" also fired on opacity:0.5. The look-behind keeps fill-opacity and
    // stroke-opacity out of it.
    return preg_match('~(?<![\w-])opacity\s*:\s*(?:0+(?:\.0*)?|\.0+)%?(?![\d.])~i', $style) === 1;
}
|
|
|
|
/**
 * Work out the text a reader would see for a link.
 *
 * The anchor's inner markup is reduced to plain text: script/style blocks are
 * removed, tags stripped, entities decoded and whitespace collapsed. When nothing
 * is left, the anchor's aria-label, title and alt attributes are tried in that order.
 *
 * @param string $attrs Raw attribute string of the anchor tag.
 * @param string $inner Markup between <a …> and </a>.
 * @return string The label, or "" when none could be found.
 */
function visible_label(string $attrs, string $inner): string
{
    // Single-quoted on purpose: inside a double-quoted PHP string "\1" is the octal
    // escape for byte 0x01, not a regex backreference, so the pattern never matched
    // and script/style source text ended up in the label.
    $cleanInner = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $inner) ?? $inner;

    $text = html_entity_decode(strip_tags($cleanInner), ENT_QUOTES | ENT_HTML5, "UTF-8");
    $text = trim(preg_replace("/\s+/u", " ", $text) ?? "");
    if ($text !== "") {
        return $text;
    }

    // NOTE(review): "alt" is looked up on the anchor itself; an <img alt="…"> inside
    // $inner is not consulted, so image-only links get no label. Confirm whether that
    // is intended.
    foreach (["aria-label", "title", "alt"] as $attr) {
        $candidate = attr_value($attrs, $attr);
        if ($candidate !== "") {
            return trim(preg_replace("/\s+/u", " ", $candidate) ?? "");
        }
    }

    return "";
}
|
|
|
|
/**
 * Explain why a link target should be ignored, if it should.
 *
 * @param string $href     The href exactly as written in the markup.
 * @param string $absolute The same target after resolution to an absolute URL.
 * @return string "fragment" for empty or in-page targets, "scheme" for anything
 *                that is not http(s), "asset" for script/style/image/font files,
 *                or "" when the link should be kept.
 */
function skip_reason(string $href, string $absolute): string
{
    $hrefLower = strtolower(trim($href));
    if ($hrefLower === "" || $hrefLower[0] === "#") {
        return "fragment";
    }

    // Reject every explicit scheme other than http/https. A fixed deny-list
    // (javascript:, mailto:, …) let ftp:, sms:, file: and others through; those
    // were then resolved as relative paths and reported as bogus on-site links.
    if (
        preg_match('~^([a-z][a-z0-9+.-]*):~', $hrefLower, $m)
        && $m[1] !== "http"
        && $m[1] !== "https"
    ) {
        return "scheme";
    }

    if (!preg_match("~^https?://~i", $absolute)) {
        return "scheme";
    }

    $path = (string)(parse_url($absolute, PHP_URL_PATH) ?: "");
    if (preg_match("~\.(?:js|mjs|css|map|png|jpe?g|gif|svg|webp|ico|woff2?|ttf|eot)(?:$|[?#])~i", $path)) {
        return "asset";
    }

    return "";
}
|
|
|
|
/**
 * Resolve an href against a base URL.
 *
 * Absolute http(s) URLs are returned unchanged, protocol-relative ones get
 * "https:", root-relative ones are attached to the base origin, and everything
 * else is appended to the directory of the base path (spaces become %20).
 * "./" and "../" segments are not normalised.
 *
 * @param string $href Link target as written in the markup; entities are decoded.
 * @param string $base URL of the page the link was found on.
 * @return string The absolute URL.
 */
function absolute_url(string $href, string $base): string
{
    $href = html_entity_decode(trim($href), ENT_QUOTES | ENT_HTML5, "UTF-8");

    if (preg_match("~^https?://~i", $href)) {
        return $href;
    }
    if (str_starts_with($href, "//")) {
        return "https:" . $href;
    }

    // parse_url() returns false for seriously malformed input; fall back to defaults.
    $parts = parse_url($base);
    if (!is_array($parts)) {
        $parts = [];
    }

    $origin = ($parts["scheme"] ?? "https") . "://" . ($parts["host"] ?? "spireason.neocities.org");
    if (isset($parts["port"])) {
        // Keep a non-default port so links on e.g. localhost:8080 stay reachable.
        $origin .= ":" . $parts["port"];
    }

    if (str_starts_with($href, "/")) {
        return $origin . $href;
    }

    // The directory is everything up to and including the last "/". dirname() is
    // wrong here: for a base path such as "/docs/" it yields "/", which dropped
    // the final directory.
    $basePath = $parts["path"] ?? "/";
    $lastSlash = strrpos($basePath, "/");
    $dir = $lastSlash === false ? "/" : substr($basePath, 0, $lastSlash + 1);
    if ($dir[0] !== "/") {
        $dir = "/" . $dir;
    }

    return $origin . $dir . str_replace(" ", "%20", $href);
}
|
|
|
|
/**
 * Reduce an HTML fragment to a single line of readable text.
 *
 * Script and style blocks are removed, tags stripped and entities decoded. A few
 * heuristic passes then delete leftovers that look like a dangling id="…"> tag
 * tail, CSS rules, CSS declarations, comments and empty @media blocks, before
 * whitespace is collapsed.
 *
 * @param string $html HTML fragment.
 * @return string The cleaned text; "" if the final whitespace pass fails.
 */
function clean_text(string $html): string
{
    // Single-quoted on purpose: inside a double-quoted PHP string "\1" is the octal
    // escape for byte 0x01, not a regex backreference, so this pattern never matched
    // and script/style source leaked into the text.
    $html = preg_replace('~<(script|style)\b[^>]*>.*?</\1\s*>~is', ' ', $html) ?? $html;

    $text = html_entity_decode(strip_tags($html), ENT_QUOTES | ENT_HTML5, "UTF-8");

    // Each pass keeps the previous text when the regex engine reports an error
    // (preg_replace() returns null, e.g. on malformed UTF-8 with the u modifier).
    // Tail of an opening tag that was cut in half: id="…">
    $text = preg_replace("~\bid=\"[^\"]+\"\s*>~u", " ", $text) ?? $text;
    // Something shaped like a CSS rule: #sel or .sel followed by { … }
    $text = preg_replace("~[#.][A-Za-z0-9_-][^{}]{0,220}\\{[^{}]*\\}~u", " ", $text) ?? $text;
    // Something shaped like a CSS declaration; this can also eat prose that starts
    // with one of these words.
    $text = preg_replace("~\b(?:font|color|background|padding|margin|position|display|border|box-shadow|width|height|transform|opacity|z-index|transition)[^.;]{0,180}[.;]~iu", " ", $text) ?? $text;
    // /* … */ comments
    $text = preg_replace("~/\\*.*?\\*/~s", " ", $text) ?? $text;
    // @media blocks left empty by the passes above
    $text = preg_replace("~@media[^{}]*\\{\\s*\\}~u", " ", $text) ?? $text;

    return trim(preg_replace("/\s+/u", " ", $text) ?? "");
}
|
|
|
|
/**
 * Assign a coarse category to a link from its URL and label.
 *
 * The checks are substring heuristics evaluated in order; the first hit wins.
 *
 * @param string $url   Absolute link target.
 * @param string $label Visible link text.
 * @return string One of: pdf, archive, dataset, source, github, tool, bot, applet,
 *                presentation, media, external, onsite.
 */
function categorize_link(string $url, string $label): string
{
    $host = strtolower((string)(parse_url($url, PHP_URL_HOST) ?: ""));
    $path = strtolower((string)(parse_url($url, PHP_URL_PATH) ?: ""));
    $haystack = strtolower($url . " " . $label);

    if (str_ends_with($path, ".pdf")) return "pdf";
    if (str_ends_with($path, ".zip")) return "archive";
    if (str_ends_with($path, ".json") || str_contains($path, "numberdatabase")) return "dataset";
    if (str_ends_with($path, ".py") || str_contains($haystack, "script")) return "source";

    // The host check has to come before the generic "github" mention below. The
    // haystack contains the URL, so the mention test matched every GitHub host
    // first and the "github" category could never be returned.
    if (str_contains($host, "github")) return "github";
    if (str_contains($haystack, "github")) return "source";

    if (str_contains($haystack, "frequency") || str_contains($haystack, "frequential") || str_contains($haystack, "octave") || str_contains($haystack, "calculator") || str_contains($haystack, "counter")) return "tool";
    if (str_contains($haystack, "bot") || str_contains($host, "perplexity.ai")) return "bot";
    if (str_contains($path, "apples") || str_contains($haystack, "app") || str_contains($host, "lovable.app") || str_contains($host, "bolt.host") || str_contains($host, "appmedo.com")) return "applet";
    if (str_contains($host, "prezi") || str_contains($host, "docs.google")) return "presentation";
    if (str_contains($host, "youtube") || str_contains($host, "notion")) return "media";
    if ($host !== "spireason.neocities.org") return "external";

    return "onsite";
}
|
|
|
|
/**
 * Turn a category key into its display label.
 *
 * @param string $category Category key, e.g. one returned by categorize_link().
 * @return string The label; for unknown keys, the key with dashes turned into
 *                spaces and its first letter upper-cased.
 */
function category_label(string $category): string
{
    return match ($category) {
        "pdf" => "PDF text",
        "archive" => "Archive",
        "dataset" => "Dataset",
        "source" => "Source file",
        "tool" => "Tool",
        "bot" => "External bot/system",
        "github" => "Repository",
        "applet" => "Applet",
        "presentation" => "Presentation",
        "media" => "Media",
        "external" => "External system",
        "onsite" => "On-site branch",
        default => ucfirst(str_replace("-", " ", $category)),
    };
}
|
|
|
|
/**
 * Classify a host by how well it is known.
 *
 * @param string $host Lower-case host name of a link target.
 * @return string "source" for the scanned site itself, "known-external" for a
 *                listed host or any of its subdomains, otherwise "review".
 */
function safety_class(string $host): string
{
    if ($host === "spireason.neocities.org") {
        return "source";
    }

    $recognised = [
        "laegna.notaku.site",
        "prezi.com",
        "www.perplexity.ai",
        "huggingface.co",
        "assorted-canopy-961.notion.site",
        "www.youtube.com",
        "youtube.com",
        "github.com",
        "tambetvali.github.io",
        "archive.org",
    ];

    // Exact host, or a subdomain of it (the leading dot keeps "evilgithub.com" out).
    $covers = static fn (string $domain): bool =>
        $host === $domain || str_ends_with($host, "." . $domain);

    return count(array_filter($recognised, $covers)) > 0 ? "known-external" : "review";
}
|
|
|
|
/**
 * Shorten text to at most $limit characters, counting UTF-8 code points.
 *
 * Text longer than the limit is cut to $limit - 1 characters and an ellipsis is
 * appended. Text that cannot be split (empty, or not valid UTF-8) is returned
 * unchanged.
 *
 * @param string $text  Text to shorten.
 * @param int    $limit Maximum number of characters to return.
 * @return string The possibly shortened text.
 */
function unicode_limit(string $text, int $limit): string
{
    $total = preg_match_all("/./us", $text, $found);

    // A falsy result covers both "no characters" (0) and a regex failure (false).
    if (!$total || count($found[0]) <= $limit) {
        return $text;
    }

    $kept = array_slice($found[0], 0, max(0, $limit - 1));

    return implode("", $kept) . "…";
}
|