39246-vm/worker.py
2026-03-20 05:56:48 +00:00

6043 lines
277 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Ghost Node — Worker
Three-thread architecture:
Thread A → FastAPI dashboard (port 3001)
Thread B → Async Playwright scraper (nuclear_engine)
Thread C → Telegram C2 polling loop
"""
from __future__ import annotations
import sys

# Force UTF-8 console output: Windows consoles default to a legacy code page
# that cannot encode the emoji used throughout the log messages.
# Guard each stream independently — the original guarded only sys.stdout and
# then called sys.stderr.reconfigure() unconditionally, which raises
# AttributeError when stderr has been replaced by an object without
# reconfigure() (pipes, service wrappers, test harnesses).
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')
import asyncio
import collections
import difflib
import json
import os
import platform
import random
import re
import sys
import threading
import time
from contextlib import asynccontextmanager
from datetime import datetime, timedelta
from typing import Any, Optional
import httpx
import uvicorn
from fastapi import Depends, FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from playwright.async_api import async_playwright
from sqlalchemy.orm import Session
from database import SessionLocal, get_db
from models import Config, Keyword, Listing, ScoringRule, SiteSelectors, TargetSite, ScrapeRound, ScrapeRoundItem, calculate_attribute_score, seed_database
# ─────────────────────────────────────────────────────────────────────────────
# Redis — optional write-through cache + pub/sub
# Enabled when REDIS_URL env var is set (e.g. redis://localhost:6379/0).
# If Redis is absent or unavailable the app runs identically using in-memory
# dicts — zero breakage, zero required configuration.
# ─────────────────────────────────────────────────────────────────────────────
_redis_client = None
def _init_redis() -> None:
"""Connect to Redis if REDIS_URL is set. Called once at startup."""
global _redis_client
url = os.environ.get("REDIS_URL", "").strip()
if not url:
print("[Redis] REDIS_URL not set — running without Redis cache.")
return
try:
import redis as _redis_lib
_redis_client = _redis_lib.from_url(url, decode_responses=True, socket_connect_timeout=3)
_redis_client.ping()
print(f"[Redis] ✅ Connected: {url}")
except ImportError:
print("[Redis] ⚠️ redis package not installed. Run: pip install redis")
_redis_client = None
except Exception as exc:
print(f"[Redis] ⚠️ Connection failed ({exc}) — falling back to in-memory.")
_redis_client = None
# Redis key / channel names shared by the stats cache and the event bus.
_REDIS_STATS_KEY = "ghostnode:stats"
_REDIS_PUBSUB_CH = "ghostnode:events"


def _redis_set_stats(stats: dict) -> None:
    """Write stats dict to Redis hash. No-op if Redis unavailable."""
    if _redis_client is None:
        return
    try:
        _redis_client.hset(
            _REDIS_STATS_KEY,
            mapping={field: str(value) for field, value in stats.items()},
        )
        # auto-expire after 1h idle so a dead node's stats don't linger
        _redis_client.expire(_REDIS_STATS_KEY, 3600)
    except Exception:
        pass  # best-effort cache — never let Redis errors surface
def _redis_publish(event_type: str, payload: dict) -> None:
    """Publish a JSON event to the ghostnode:events pub/sub channel."""
    if _redis_client is None:
        return
    try:
        # "type" first; payload keys may deliberately override it.
        message = json.dumps({"type": event_type, **payload})
        _redis_client.publish(_REDIS_PUBSUB_CH, message)
    except Exception:
        pass  # best-effort — a pub/sub failure must not break the caller
def _redis_cache_set(key: str, value: str, ex: int = 300) -> None:
    """Generic set with TTL. Used for rate caches, config snapshots, etc."""
    if _redis_client is None:
        return
    try:
        _redis_client.set(key, value, ex=ex)
    except Exception:
        pass  # cache writes are best-effort
def _redis_cache_get(key: str) -> str | None:
    """Generic get. Returns None if Redis unavailable or key missing."""
    if _redis_client is None:
        return None
    try:
        return _redis_client.get(key)
    except Exception:
        return None  # treat a Redis error exactly like a cache miss
# ─────────────────────────────────────────────────────────────────────────────
# Bootstrap
# ─────────────────────────────────────────────────────────────────────────────
# One-time bootstrap — runs at import time because this module is executed
# as the worker entry point.
seed_database()  # idempotent — only seeds if tables are empty
_init_redis()    # optional — no-op if REDIS_URL not set
# ── Print active-site roster so the operator can verify disabled sites ────────
def _print_active_sites() -> None:
    """
    Runs once at startup. Prints every TargetSite row with its enabled status
    so the operator can immediately see which sites the scraper will visit and
    confirm that any toggled-off sites are genuinely excluded.
    """
    db = SessionLocal()
    try:
        all_sites = db.query(TargetSite).order_by(TargetSite.id).all()
        print("\n[GhostNode] 📋 Target Site Roster:")
        print( " ┌─────┬──────────┬───────────────────────────────────────────────────────────────┐")
        for s in all_sites:
            # enabled is stored as an integer flag (1 = active), not a bool.
            status = "✅ ACTIVE " if s.enabled == 1 else "⏸ DISABLED"
            # A {keyword} placeholder in the URL template means the site is
            # searched directly via URL; otherwise the homepage is scraped.
            mode = "DIRECT " if "{keyword}" in s.url_template else "HOMEPAGE"
            sel_str = f" sel={s.search_selector!r}" if s.search_selector else ""
            # URL is truncated to 40 chars to keep the table readable.
            print(f"{s.id:<3}{status}{s.name:<20} [{mode}] {s.url_template[:40]}{sel_str}")
        print( " └─────┴──────────┴───────────────────────────────────────────────────────────────┘\n")
    finally:
        db.close()


_print_active_sites()
# ── AI Debug Log ─────────────────────────────────────────────────────────────
# Circular buffer holding the last 300 AI call records.
# Written by _ai_log_entry(); read by GET /api/ai/debug/log.
# Active whenever ai_debug = true in config.
_ai_debug_log: collections.deque = collections.deque(maxlen=300)
_ai_debug_log_id: int = 0 # monotonic counter for ordering
_ai_debug_log_lock = threading.Lock()
def _ai_log_entry(entry: dict) -> None:
"""Append one record to the in-memory AI debug ring buffer."""
global _ai_debug_log_id
with _ai_debug_log_lock:
_ai_debug_log_id += 1
entry["id"] = _ai_debug_log_id
entry.setdefault("ts", datetime.utcnow().isoformat(timespec="seconds") + "Z")
_ai_debug_log.append(entry)
# Shared mutable state (thread-safe reads are fine for these primitives)
_stats: dict[str, Any] = {
    "total_scanned": 0,           # listings examined across all cycles
    "total_alerts": 0,            # alerts dispatched across all cycles
    "last_cycle": "Never",        # marker of the most recent finished cycle
    "engine_status": "Idle",      # "Idle" | "Running" | "Paused" (see /pause, /resume C2 commands)
    "uptime_start": time.time(),  # epoch seconds at process start, used by /status
}
# Set by any API write endpoint (add/edit/delete keyword, site, config).
# The scraper loop polls this every 5 s during its inter-cycle sleep and
# wakes up immediately so changes take effect on the very next cycle.
_cycle_now = threading.Event()
# Each agent entry is paired with matching Accept-Language / platform hints
# so the full HTTP header set is internally consistent — detectors check
# that UA, Accept-Language, and navigator.platform all agree.
_agent_profiles: list[dict] = [
    # Chrome 124 / Windows 10 — US-East profile, full-HD viewport.
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-US,en;q=0.9",
        "locale": "en-US", "tz": "America/New_York",
        "viewport": (1920, 1080),
    },
    # Edge 123 / Windows 10 — UK profile.
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-GB,en;q=0.9",
        "locale": "en-GB", "tz": "Europe/London",
        "viewport": (1440, 900),
    },
    # Edge 124 / Windows 10 — UK profile, laptop-class viewport.
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-GB,en;q=0.9",
        "locale": "en-GB", "tz": "Europe/London",
        "viewport": (1366, 768),
    },
    # Chrome 124 / macOS 14 — US-West profile; viewport matches a 14" MBP.
    {
        "ua": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "platform": "MacIntel", "vendor": "Google Inc.", "lang": "en-US,en;q=0.9",
        "locale": "en-US", "tz": "America/Los_Angeles",
        "viewport": (1512, 982),
    },
    # Firefox 125 / Windows 10 — note empty vendor: Firefox reports "".
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
        "platform": "Win32", "vendor": "", "lang": "en-US,en;q=0.5",
        "locale": "en-US", "tz": "America/Chicago",
        "viewport": (1280, 800),
    },
]
# Keep a flat UA list for legacy callers (price refresh loop etc.)
_rotating_agents: list[str] = [p["ua"] for p in _agent_profiles]
# ─────────────────────────────────────────────────────────────────────────────
# Humanisation helpers
# ─────────────────────────────────────────────────────────────────────────────
def _jitter(base: float, pct: float = 0.35) -> float:
"""Return base ± pct%, minimum 0.3 s."""
spread = base * pct
return max(0.3, base + random.uniform(-spread, spread))
def _bezier_points(x0: int, y0: int, x1: int, y1: int, n: int) -> list[tuple[int,int]]:
"""
Generate n points along a cubic bezier curve between (x0,y0) and (x1,y1).
Two random control points give the path a natural arc rather than a
straight line — straight-line mouse movements are a strong bot signal.
"""
cx1 = x0 + random.randint(-120, 120)
cy1 = y0 + random.randint(-60, 60)
cx2 = x1 + random.randint(-120, 120)
cy2 = y1 + random.randint(-60, 60)
pts = []
for i in range(n):
t = i / (n - 1) if n > 1 else 0
mt = 1 - t
px = int(mt**3*x0 + 3*mt**2*t*cx1 + 3*mt*t**2*cx2 + t**3*x1)
py = int(mt**3*y0 + 3*mt**2*t*cy1 + 3*mt*t**2*cy2 + t**3*y1)
pts.append((px, py))
return pts
async def _human_mouse(page) -> None:
    """
    Heavy human mouse simulation.
    - Bezier-curve paths (not straight lines)
    - Variable speed: slow at start/end, faster in the middle (ease-in-out)
    - Random hover-pauses over elements as if reading them
    - Occasional micro-tremors (hand shake)
    - 5-9 movements total before scraping begins

    `page` is a Playwright async Page (assumed — TODO confirm against callers).
    """
    try:
        # Viewport bounds — all generated coordinates are clamped inside them.
        vw = await page.evaluate("window.innerWidth")
        vh = await page.evaluate("window.innerHeight")
        # Start somewhere in the upper-middle area of the page.
        x = random.randint(int(vw * 0.25), int(vw * 0.55))
        y = random.randint(int(vh * 0.15), int(vh * 0.35))
        for move_n in range(random.randint(5, 9)):
            # Pick a target in the "content area" (not nav bars)
            tx = random.randint(int(vw * 0.05), int(vw * 0.90))
            ty = random.randint(int(vh * 0.10), int(vh * 0.85))
            pts = _bezier_points(x, y, tx, ty, random.randint(18, 35))
            for idx, (px, py) in enumerate(pts):
                # Micro-tremor: tiny random offset every step
                jx = px + random.randint(-2, 2)
                jy = py + random.randint(-2, 2)
                # Clamp so the tremor never leaves the viewport.
                jx = max(1, min(vw - 1, jx))
                jy = max(1, min(vh - 1, jy))
                await page.mouse.move(jx, jy)
                # Ease-in-out speed: slow at edges of path, fast in middle
                progress = idx / len(pts)
                speed = 0.5 - 0.4 * abs(progress - 0.5) * 2  # ~0.1 at ends, ~0.5 mid
                await asyncio.sleep(random.uniform(speed * 0.015, speed * 0.045))
            x, y = tx, ty
            # After each movement: sometimes hover and "read" for a moment
            if random.random() < 0.55:
                await asyncio.sleep(random.uniform(0.3, 1.4))
            # Occasionally: brief fast micro-movements (thinking/fidgeting)
            if random.random() < 0.25:
                for _ in range(random.randint(3, 7)):
                    await page.mouse.move(
                        x + random.randint(-8, 8),
                        y + random.randint(-8, 8),
                    )
                    await asyncio.sleep(random.uniform(0.02, 0.06))
            # Short settle pause between movements.
            await asyncio.sleep(random.uniform(0.05, 0.25))
    except Exception:
        pass  # never let mouse errors kill the scrape
async def _human_scroll(page, steps: int = 5) -> None:
    """
    Heavy human scroll simulation.
    - Variable scroll distances (people don't scroll the same amount each time)
    - Longer read-pauses mid-page (as if reading a listing title)
    - Occasional scroll-back-up to re-check something
    - Final scroll back toward top to simulate "I've seen enough" behaviour
    - Scroll wheel events (mousewheel) not just JS scrollBy

    `page` is a Playwright async Page (assumed — TODO confirm against callers).
    """
    try:
        vh = await page.evaluate("window.innerHeight")
        # Net pixels scrolled down, used to decide the final scroll-back.
        total_scrolled = 0
        for step in range(steps):
            # Variable scroll: sometimes a quick skim, sometimes a slow read
            dist = int(vh * random.uniform(0.30, 0.85))
            # Use actual mouse wheel scroll — more realistic than JS scrollBy
            # Wheel delta in px — browsers normalise this but detectors see it
            await page.mouse.wheel(0, dist)
            total_scrolled += dist
            await asyncio.sleep(random.uniform(0.15, 0.35))
            # Random read-pause: longer stop as if reading a result
            if random.random() < 0.65:
                await asyncio.sleep(random.uniform(0.6, 2.2))
            # Occasional scroll-back-up (re-reading behaviour) — never on the
            # very first step, since there is nothing above to re-read yet.
            if random.random() < 0.30 and step > 0:
                back = int(dist * random.uniform(0.25, 0.65))
                await page.mouse.wheel(0, -back)
                total_scrolled -= back
                await asyncio.sleep(random.uniform(0.4, 1.0))
        # Final: scroll back toward top (user finished scanning, ready to act)
        if total_scrolled > vh:
            await asyncio.sleep(random.uniform(0.5, 1.2))
            # Scroll back in 2-3 steps rather than teleporting to top
            scroll_back = int(total_scrolled * random.uniform(0.4, 0.75))
            for _ in range(random.randint(2, 3)):
                chunk = scroll_back // 2
                await page.mouse.wheel(0, -chunk)
                await asyncio.sleep(random.uniform(0.2, 0.5))
    except Exception:
        pass  # scrolling is cosmetic — never abort the scrape over it
def _build_stealth_script(profile: dict) -> str:
    """
    Returns a comprehensive JS init script that patches 30+ navigator/
    window properties checked by Cloudflare, DataDome, PerimeterX, and
    similar bot-detection systems.

    All patches use Object.defineProperty so they cannot be overwritten
    by the site's own JS after the fact.

    The random values baked into the script (hardware specs, canvas noise,
    timing offsets) are sampled once per call, so each browser context gets
    a distinct but self-consistent fingerprint.

    profile: one entry from _agent_profiles (keys: platform, vendor, lang…).
    Returns: JS source string, intended for Playwright's add_init_script
             (assumed — TODO confirm against the caller).
    """
    platform = profile.get("platform", "Win32")
    vendor = profile.get("vendor", "Google Inc.")
    lang = profile.get("lang", "en-US,en;q=0.9")
    # Accept-Language first token
    lang0 = lang.split(",")[0].strip()
    return f"""
(() => {{
  // ── 1. Core webdriver / automation flags ──────────────────────────────
  const def = (obj, prop, val) => {{
    try {{ Object.defineProperty(obj, prop, {{ get: () => val, configurable: true }}); }}
    catch(e) {{}}
  }};
  def(navigator, 'webdriver', undefined);
  def(navigator, 'plugins', [
    {{ name:'PDF Viewer', filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
    {{ name:'Chrome PDF Viewer', filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
    {{ name:'Chromium PDF Viewer',filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
    {{ name:'Microsoft Edge PDF Viewer',filename:'internal-pdf-viewer',description:'Portable Document Format', length:1 }},
    {{ name:'WebKit built-in PDF',filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
  ]);
  def(navigator, 'languages', ['{lang0}', '{lang0.split("-")[0]}']);
  def(navigator, 'platform', '{platform}');
  def(navigator, 'vendor', '{vendor}');
  def(navigator, 'hardwareConcurrency', {random.choice([4, 6, 8, 12, 16])});
  def(navigator, 'deviceMemory', {random.choice([4, 8, 16])});
  def(navigator, 'maxTouchPoints', 0);
  def(navigator, 'cookieEnabled', true);
  def(navigator, 'onLine', true);
  def(navigator, 'doNotTrack', null);
  // ── 2. Chrome runtime object — must be non-trivially populated ────────
  if (!window.chrome) window.chrome = {{}};
  window.chrome.runtime = window.chrome.runtime || {{
    id: undefined,
    connect: () => {{}},
    sendMessage: () => {{}},
    onMessage: {{ addListener: () => {{}} }},
  }};
  window.chrome.loadTimes = () => ({{
    requestTime: Date.now() / 1000 - Math.random() * 0.3,
    startLoadTime: Date.now() / 1000 - Math.random() * 0.2,
    commitLoadTime: Date.now() / 1000 - Math.random() * 0.1,
    finishDocumentLoadTime: Date.now() / 1000,
    finishLoadTime: Date.now() / 1000,
    firstPaintTime: Date.now() / 1000 - Math.random() * 0.05,
    firstPaintAfterLoadTime: 0,
    navigationType: 'Other',
    wasFetchedViaSpdy: true,
    wasNpnNegotiated: true,
    npnNegotiatedProtocol: 'h2',
    wasAlternateProtocolAvailable: false,
    connectionInfo: 'h2',
  }});
  window.chrome.csi = () => ({{
    startE: Date.now() - Math.floor(Math.random()*2000+500),
    onloadT: Date.now() - Math.floor(Math.random()*200),
    pageT: Math.random()*2000+300,
    tran: 15,
  }});
  // ── 3. Permissions API — real browser returns 'granted'/'prompt' ──────
  if (navigator.permissions) {{
    const _query = navigator.permissions.query.bind(navigator.permissions);
    navigator.permissions.query = (params) =>
      params.name === 'notifications'
        ? Promise.resolve({{ state: Notification.permission }})
        : _query(params);
  }}
  // ── 4. WebGL renderer — headless Chrome returns "SwiftShader" ─────────
  // Real GPU names: "ANGLE (Intel, Mesa Intel(R) UHD...)" etc.
  const _getCtx = HTMLCanvasElement.prototype.getContext;
  HTMLCanvasElement.prototype.getContext = function(type, ...args) {{
    const ctx = _getCtx.call(this, type, ...args);
    if (type === 'webgl' || type === 'webgl2') {{
      const _getPara = ctx.getParameter.bind(ctx);
      ctx.getParameter = function(param) {{
        if (param === 37445) return 'Intel Inc.';
        if (param === 37446) return 'ANGLE (Intel, Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0, D3D11)';
        return _getPara(param);
      }};
    }}
    return ctx;
  }};
  // ── 5. Canvas fingerprint noise — tiny random pixel perturbation ──────
  const _toDataURL = HTMLCanvasElement.prototype.toDataURL;
  HTMLCanvasElement.prototype.toDataURL = function(type) {{
    const ctx2 = this.getContext('2d');
    if (ctx2) {{
      const img = ctx2.getImageData(0, 0, 1, 1);
      img.data[0] ^= {random.randint(1, 8)};
      ctx2.putImageData(img, 0, 0);
    }}
    return _toDataURL.call(this, type);
  }};
  // ── 6. Audio fingerprint noise ────────────────────────────────────────
  const _createBuffer = AudioBuffer.prototype.getChannelData;
  if (_createBuffer) {{
    AudioBuffer.prototype.getChannelData = function(ch) {{
      const data = _createBuffer.call(this, ch);
      for (let i = 0; i < data.length; i += 1000)
        data[i] += Math.random() * 0.0000001;
      return data;
    }};
  }}
  // ── 7. Screen / window — match viewport ───────────────────────────────
  def(screen, 'width', window.innerWidth || {random.choice([1366,1440,1920])});
  def(screen, 'height', window.innerHeight || {random.choice([768,900,1080])});
  def(screen, 'availWidth', window.innerWidth || {random.choice([1366,1440,1920])});
  def(screen, 'availHeight', (window.innerHeight || {random.choice([768,900,1080])}) - 40);
  def(screen, 'colorDepth', 24);
  def(screen, 'pixelDepth', 24);
  def(window, 'devicePixelRatio', 1);
  def(window, 'outerWidth', window.innerWidth);
  def(window, 'outerHeight', window.innerHeight + {random.randint(85,110)});
  // ── 8. Timing API — real browsers have nonzero connection timings ─────
  if (window.PerformanceTiming) {{
    const _now = Date.now();
    const t = performance.timing;
    ['navigationStart','unloadEventStart','unloadEventEnd',
     'redirectStart','redirectEnd','fetchStart','domainLookupStart',
     'domainLookupEnd','connectStart','connectEnd','requestStart',
     'responseStart','responseEnd','domLoading','domInteractive',
     'domContentLoadedEventStart','domContentLoadedEventEnd',
     'domComplete','loadEventStart','loadEventEnd'].forEach((k,i) => {{
      try {{ Object.defineProperty(t, k, {{ get: () => _now - (19-i)*{random.randint(8,25)} }}); }}
      catch(e) {{}}
    }});
  }}
  // ── 9. Focus / visibility lock ────────────────────────────────────────
  def(document, 'visibilityState', 'visible');
  def(document, 'hidden', false);
  document.hasFocus = () => true;
  document.addEventListener('visibilitychange',
    e => e.stopImmediatePropagation(), true);
  // ── 10. iframe contentWindow.navigator.webdriver ─────────────────────
  const _attach = HTMLIFrameElement.prototype.attachShadow;
  try {{
    Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {{
      get: function() {{
        const w = Object.getOwnPropertyDescriptor(
          HTMLIFrameElement.prototype, 'contentWindow'
        ).get.call(this);
        if (w && w.navigator)
          def(w.navigator, 'webdriver', undefined);
        return w;
      }}
    }});
  }} catch(e) {{}}
  // ── 11. Battery API ──────────────────────────────────────────────────
  if (navigator.getBattery) {{
    navigator.getBattery = () => Promise.resolve({{
      charging: true, chargingTime: 0,
      dischargingTime: Infinity, level: 0.77,
      addEventListener: () => {{}},
    }});
  }}
  // ── 12. Network Info ─────────────────────────────────────────────────
  if (navigator.connection) {{
    def(navigator.connection, 'rtt', 20);
    def(navigator.connection, 'downlink', 25);
    def(navigator.connection, 'effectiveType', '4g');
    def(navigator.connection, 'saveData', false);
  }}
  // ── 13. Media devices ────────────────────────────────────────────────
  if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {{
    const _enumDev = navigator.mediaDevices.enumerateDevices
      .bind(navigator.mediaDevices);
    navigator.mediaDevices.enumerateDevices = () =>
      _enumDev().then(d => d.length ? d : [
        {{ kind:'audioinput', deviceId:'default', label:'', groupId:'' }},
        {{ kind:'audiooutput', deviceId:'default', label:'', groupId:'' }},
      ]);
  }}
  // ── 14. Browser-specific: Yandex / Edge / Brave ──────────────────────
  (function() {{
    const ua = navigator.userAgent || '';
    if (ua.includes('YaBrowser')) {{
      window.yandex = window.yandex || {{}};
      window.Ya = window.Ya || {{}};
      def(navigator, 'vendor', 'Yandex');
    }}
    if (ua.includes('Edg/') && !ua.includes('Edge/')) {{
      window.msWriteProfilerMark =
        window.msWriteProfilerMark || (() => {{}});
    }}
    if (ua.includes('Brave')) {{
      navigator.brave = {{ isBrave: () => Promise.resolve(true) }};
    }}
  }})();
}})();
"""
# ─────────────────────────────────────────────────────────────────────────────
# Browser resolver — Edge → Yandex → Chromium fallback
# ─────────────────────────────────────────────────────────────────────────────
def _resolve_browser() -> tuple[str, str]:
    """
    Locate the browser binary to use for scraping.

    Priority:
      1. Read 'browser_choice' from the DB (set via Settings tab).
         Choices: 'auto' | 'chrome' | 'edge' | 'yandex' | 'brave'
      2. If choice is 'auto' (or unset), probe Edge → Yandex → Chrome → Brave.
      3. For an explicit choice, search known install paths + PATH.

    Falls back to Playwright Chromium if the chosen browser isn't found.
    Edge, Yandex, Brave, and Chrome are all Chromium-based, so Playwright
    drives each of them via pw.chromium.launch(executable_path=...).

    Returns (browser_label, executable_path_or_empty_string).
    An empty path tells Playwright to use its own managed Chromium.
    """
    import shutil
    choice = _get_config("browser_choice", "auto").strip().lower()
    # ── Candidate path tables ─────────────────────────────────────────────────
    BROWSERS: dict[str, dict] = {
        "chrome": {
            "label": "Google Chrome",
            "paths": [
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/chromium-browser",
            ],
            "which": ["google-chrome", "google-chrome-stable", "chromium-browser", "chrome"],
        },
        "edge": {
            "label": "Microsoft Edge",
            "paths": [
                r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
                r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
                "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
                "/usr/bin/microsoft-edge",
                "/usr/bin/microsoft-edge-stable",
            ],
            "which": ["msedge", "microsoft-edge", "microsoft-edge-stable"],
        },
        "yandex": {
            "label": "Yandex Browser",
            "paths": [
                # Yandex installs per-user on Windows, so the path embeds the username.
                rf"C:\Users\{os.environ.get('USERNAME', os.environ.get('USER', ''))}\AppData\Local\Yandex\YandexBrowser\Application\browser.exe",
                r"C:\Program Files\Yandex\YandexBrowser\Application\browser.exe",
                "/Applications/Yandex.app/Contents/MacOS/Yandex",
                "/usr/bin/yandex-browser",
                "/usr/bin/yandex-browser-stable",
            ],
            "which": ["yandex-browser", "yandex-browser-stable"],
        },
        "brave": {
            "label": "Brave Browser",
            "paths": [
                r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe",
                r"C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe",
                rf"C:\Users\{os.environ.get('USERNAME', os.environ.get('USER', ''))}\AppData\Local\BraveSoftware\Brave-Browser\Application\brave.exe",
                "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
                "/usr/bin/brave-browser",
                "/usr/bin/brave-browser-stable",
                "/usr/bin/brave",
            ],
            "which": ["brave-browser", "brave-browser-stable", "brave"],
        },
    }

    def _find(key: str) -> tuple[str, str]:
        """Return (label, exe) for a named browser, or ('', '') if not found."""
        entry = BROWSERS.get(key, {})
        # Known install locations first, then PATH lookup.
        for path in entry.get("paths", []):
            if os.path.isfile(path):
                return (entry["label"], path)
        for cmd in entry.get("which", []):
            exe = shutil.which(cmd)
            if exe:
                return (entry["label"], exe)
        return ("", "")

    # ── Explicit choice ───────────────────────────────────────────────────────
    if choice in BROWSERS:
        label, exe = _find(choice)
        if exe:
            print(f"[Browser] ✅ {label} selected & found → {exe}")
            return (label, exe)
        else:
            print(
                f"[Browser] ⚠️ '{choice}' was selected in Settings but is not "
                f"installed on this machine.\n"
                f" Falling back to Playwright's managed Chromium.\n"
                f" Install {BROWSERS[choice]['label']} and restart Ghost Node."
            )
            return ("Playwright Chromium", "")
    # ── Auto-detect: Edge → Yandex → Chrome → Brave → Chromium ──────────────
    print("[Browser] 🔍 Auto-detecting browser (Edge → Yandex → Chrome → Brave)…")
    for key in ("edge", "yandex", "chrome", "brave"):
        label, exe = _find(key)
        if exe:
            # Fix: separator between label and path (was printed as one word).
            print(f"[Browser] ✅ Auto-selected: {label} → {exe}")
            return (label, exe)
    print(
        "[Browser] ⚠️ No supported browser found. Using Playwright Chromium.\n"
        " Install Edge, Yandex, Chrome, or Brave for a real browser experience."
    )
    return ("Playwright Chromium", "")
# Browser is resolved fresh each cycle inside nuclear_engine so Settings
# changes take effect on the next cycle without restarting Ghost Node.
# ─────────────────────────────────────────────────────────────────────────────
# Telegram helpers
# ─────────────────────────────────────────────────────────────────────────────
def _get_config(key: str, default: str = "") -> str:
    """Read one Config row by key; returns *default* when the row is missing or empty."""
    session = SessionLocal()
    try:
        row = session.query(Config).filter(Config.key == key).first()
        if row and row.value:
            return row.value
        return default
    finally:
        session.close()
# ── N1 Proxy rotation ──────────────────────────────────────────────────────
class _RoundRobin:
"""Thread-safe round-robin counter for proxy rotation."""
def __init__(self): self._v = 0; self._lock = threading.Lock()
def get(self) -> int:
with self._lock: return self._v
def increment(self) -> None:
with self._lock: self._v += 1
_proxy_counter = _RoundRobin()
def _get_proxy() -> dict | None:
    """
    Returns a Playwright proxy dict if proxy_enabled=true and proxy_list has entries.
    Rotates through the list round-robin using a module-level counter.
    Proxy URLs should be in the format: http://host:port or http://user:pass@host:port
    (a bare host:port entry pasted from a provider list is tolerated and
    treated as http://).
    Returns None if proxy is disabled or list is empty.
    """
    if _get_config("proxy_enabled", "false").lower() != "true":
        return None
    raw = _get_config("proxy_list", "").strip()
    if not raw:
        return None
    proxies = [p.strip() for p in raw.splitlines() if p.strip()]
    if not proxies:
        return None
    # Round-robin rotation using a shared counter
    idx = _proxy_counter.get() % len(proxies)
    _proxy_counter.increment()
    chosen = proxies[idx]
    # Fix: scheme-less entries ("host:port") previously mis-parsed — urlparse
    # needs a scheme for hostname/port to land in the right fields.
    if "://" not in chosen:
        chosen = "http://" + chosen
    # Parse proxy URL into Playwright format
    # Playwright expects: {"server": "http://host:port", "username": "...", "password": "..."}
    import urllib.parse as _up
    parsed = _up.urlparse(chosen)
    server = f"{parsed.scheme}://{parsed.hostname}"
    # Fix: omit the port entirely when absent (was rendering "host:None").
    if parsed.port:
        server += f":{parsed.port}"
    proxy: dict = {"server": server}
    if parsed.username:
        proxy["username"] = parsed.username
    if parsed.password:
        proxy["password"] = parsed.password
    return proxy
async def send_telegram(message: str) -> bool:
    """
    Send *message* to the configured Telegram chat; returns True on success.

    Pull token + chat_id fresh from the DB on every call so Settings-tab
    changes are immediately active without a restart.

    Error handling tiers:
      HTTP 400 → 'chat not found' means the bot has no open session with
                 the user. Print an ACTION REQUIRED banner with exact steps.
      HTTP 401 → Bad token — print the token prefix for comparison.
      HTTP 4xx → Any other client error — print full Telegram JSON body.
      Timeout / network → log and return False without raising so the
                 scraper continues to the next lot uninterrupted.
    """
    token = _get_config("telegram_token")
    chat_id = _get_config("telegram_chat_id")
    if not token or not chat_id:
        print("[Telegram] ⚠️ No token/chat_id in DB — save Settings first.")
        return False
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            r = await client.post(
                url,
                data={"chat_id": chat_id, "text": message, "parse_mode": "HTML"},
            )
            if r.status_code == 200:
                print(f"[Telegram] ✅ Alert sent to chat {chat_id}")
                return True
            # ── Parse the JSON error body Telegram always returns ────────
            try:
                err_body = r.json()
            except Exception:
                err_body = {"description": r.text}
            description = err_body.get("description", "").lower()
            if r.status_code == 400 and "chat not found" in description:
                # ── Actionable guidance for the most common setup mistake ─────
                print(
                    f"\n[Telegram] ❌ HTTP 400 — chat not found\n"
                    f" Chat ID in DB : {chat_id}\n"
                    f" ──────────────────────────────────────────────\n"
                    f" ACTION REQUIRED — three steps to fix this:\n"
                    f" 1. Open Telegram and find your bot.\n"
                    f" 2. Press START or send /start to it.\n"
                    f" (Bots cannot message you first — you must\n"
                    f" open the conversation from your side.)\n"
                    f" 3. If using a group, the Chat ID must begin\n"
                    f" with a minus sign, e.g. -100123456789.\n"
                    f" ──────────────────────────────────────────────\n"
                    f" Telegram raw: {err_body}\n"
                )
            elif r.status_code == 401:
                print(
                    f"[Telegram] ❌ HTTP 401 Unauthorized\n"
                    f" Token prefix in DB: '{token[:12]}'\n"
                    f" ACTION REQUIRED: Verify the Bot Token in Settings.\n"
                    f" Telegram raw: {err_body}"
                )
            else:
                # Generic non-200 — print everything for easy debugging.
                # Fix: separator between status code and token (the adjacent
                # f-strings previously concatenated as "HTTP 400token='…'").
                print(
                    f"[Telegram] ❌ HTTP {r.status_code} — "
                    f"token='{token[:12]}' chat='{chat_id}'\n"
                    f" Telegram says: {err_body}"
                )
            return False
    except httpx.TimeoutException:
        # ── Never let a Telegram timeout crash the scraper loop ──────────────
        print("[Telegram] ❌ Request timed out — check network. Scraper continues.")
        return False
    except httpx.RequestError as exc:
        print(f"[Telegram] ❌ Network error: {exc}. Scraper continues.")
        return False
    except Exception as exc:
        print(f"[Telegram] ❌ Unexpected error: {type(exc).__name__}: {exc}. Scraper continues.")
        return False
# ─────────────────────────────────────────────────────────────────────────────
# N10 — Multi-channel alert dispatcher
# ─────────────────────────────────────────────────────────────────────────────
async def _send_discord(message: str) -> bool:
    """Post *message* to the configured Discord webhook; True on success."""
    webhook = _get_config("discord_webhook")
    if not webhook:
        print("[Discord] ⚠️ No webhook URL saved.")
        return False
    # Discord uses 'content' not Telegram HTML — strip basic HTML tags,
    # and respect Discord's 2000-character content limit.
    plain = re.sub(r"<[^>]+>", "", message).strip()
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(webhook, json={"content": plain[:2000]})
            if resp.status_code in (200, 204):
                print("[Discord] ✅ Alert sent.")
                return True
            print(f"[Discord] ❌ HTTP {resp.status_code}: {resp.text[:200]}")
            return False
    except Exception as exc:
        print(f"[Discord] ❌ {exc}")
        return False
async def _send_email(message: str, subject: str = "Ghost Node Alert") -> bool:
    """Send an alert via Gmail using an App Password.

    Requires: gmail_address + gmail_app_password + email_to in Config.
    Get an App Password at: myaccount.google.com/apppasswords
    (requires 2-Step Verification to be enabled on your Google account).

    Returns True when the SMTP handshake and send succeeded, False otherwise.
    """
    import smtplib
    from email.mime.text import MIMEText
    gmail_addr = _get_config("gmail_address", "").strip()
    app_pass = _get_config("gmail_app_password", "").strip()
    to = _get_config("email_to", "").strip()
    if not all([gmail_addr, app_pass, to]):
        print("[Email] ⚠️ Gmail not configured — set gmail_address, gmail_app_password, email_to in Settings.")
        return False
    # Alerts are composed as Telegram HTML — strip tags for plain-text email.
    plain = re.sub(r"<[^>]+>", "", message).strip()

    def _do_send() -> None:
        # Runs in a worker thread: smtplib is blocking and must not stall the
        # event loop that drives the scraper.
        msg = MIMEText(plain, "plain", "utf-8")
        msg["Subject"] = subject
        msg["From"] = gmail_addr
        msg["To"] = to
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=20) as s:
            s.starttls()
            s.login(gmail_addr, app_pass)
            s.sendmail(gmail_addr, [to], msg.as_string())

    try:
        # Fix: get_running_loop() instead of the deprecated get_event_loop() —
        # we are always inside a coroutine here, so a running loop exists.
        await asyncio.get_running_loop().run_in_executor(None, _do_send)
        print(f"[Email] ✅ Alert sent to {to}")
        return True
    except Exception as exc:
        print(f"[Email] ❌ {exc}")
        return False
async def send_alert(message: str, subject: str = "Ghost Node Alert") -> None:
    """
    Route an alert to all channels the user has enabled.
    Channels are a comma-separated list in Config: e.g. "telegram,discord"
    Each channel is tried independently — one failure doesn't stop others.
    """
    channels_raw = _get_config("alert_channels", "telegram")
    enabled = [c.strip().lower() for c in channels_raw.split(",") if c.strip()]
    # Map channel name → coroutine factory; unknown names are ignored.
    dispatch = {
        "telegram": lambda: send_telegram(message),
        "discord": lambda: _send_discord(message),
        "email": lambda: _send_email(message, subject),
    }
    pending = [dispatch[ch]() for ch in enabled if ch in dispatch]
    if pending:
        # return_exceptions=True: a crash in one channel never hides the rest.
        await asyncio.gather(*pending, return_exceptions=True)
async def get_telegram_updates(offset: int) -> list[dict]:
    """Long-poll the Telegram Bot API for pending updates.

    Returns the raw "result" list from getUpdates, or [] when the bot
    token is unset, the HTTP status is not 200, or the request fails.
    """
    token = _get_config("telegram_token")
    if not token:
        return []
    endpoint = f"https://api.telegram.org/bot{token}/getUpdates"
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.get(endpoint, params={"offset": offset, "timeout": 10})
            if resp.status_code == 200:
                return resp.json().get("result", [])
            print(f"[Telegram C2] ❌ getUpdates HTTP {resp.status_code}: {resp.text}")
    except Exception as exc:
        print(f"[Telegram C2] ❌ getUpdates error: {exc}")
    return []
# ─────────────────────────────────────────────────────────────────────────────
# Thread C — Telegram C2 Polling
# ─────────────────────────────────────────────────────────────────────────────
async def telegram_c2_loop() -> None:
    """Thread C — Telegram command-and-control polling loop.

    Long-polls the bot API via get_telegram_updates() forever and
    dispatches chat commands (/status, /pause, /resume, /listings,
    /top5, /sites, /alert, /keywords, /help).  DB-backed commands each
    open a short-lived SessionLocal() session and always close it.
    Any exception inside a cycle is printed and the loop continues
    after the 3-second sleep, so one bad update never kills Thread C.
    """
    offset = 0  # update_id watermark; requesting last_id+1 acknowledges processed updates
    print("[Thread C] Telegram C2 online.")
    while True:
        try:
            updates = await get_telegram_updates(offset)
            for upd in updates:
                offset = upd["update_id"] + 1  # advance so this update isn't re-delivered
                msg = upd.get("message", {})
                text = msg.get("text", "").strip()
                chat_id_upd = msg.get("chat", {}).get("id")
                if not chat_id_upd:
                    # Not a regular chat message (no chat id) — nothing to reply to.
                    continue
                if text == "/status":
                    # Health snapshot assembled from the shared _stats dict.
                    uptime_secs = int(time.time() - _stats["uptime_start"])
                    h, rem = divmod(uptime_secs, 3600)
                    m, s = divmod(rem, 60)
                    report = (
                        "🕵️ <b>Ghost Node — Health Report</b>\n"
                        f"━━━━━━━━━━━━━━━━━━━━\n"
                        f"🟢 Engine: {_stats['engine_status']}\n"
                        f"📡 Scanned: {_stats['total_scanned']} listings\n"
                        f"🚨 Alerts sent: {_stats['total_alerts']}\n"
                        f"🔄 Last cycle: {_stats['last_cycle']}\n"
                        f"⏱️ Uptime: {h:02d}h {m:02d}m {s:02d}s\n"
                        f"🔀 Proxy: {'ON' if _get_config('proxy_enabled','false').lower()=='true' else 'OFF'}\n"
                        f"🖥️ Host OS: {platform.system()} {platform.release()}"
                    )
                    await send_telegram(report)
                elif text == "/pause":
                    # NOTE(review): engine_status is presumably polled by the
                    # scraper thread to idle while "Paused" — confirm in Thread B.
                    _stats["engine_status"] = "Paused"
                    await send_telegram("⏸️ Engine paused.")
                elif text == "/resume":
                    _stats["engine_status"] = "Running"
                    await send_telegram("▶️ Engine resumed.")
                elif text == "/listings":
                    # Five most recent captures, newest first.
                    db = SessionLocal()
                    try:
                        rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(5).all()
                        if rows:
                            lines = "\n".join(
                                f"{r.title[:40]} — £{r.price or '?'} (score {r.score})"
                                for r in rows
                            )
                            await send_telegram(f"📋 <b>Last 5 Listings:</b>\n{lines}")
                        else:
                            await send_telegram("No listings found yet.")
                    finally:
                        db.close()
                elif text == "/top5":
                    # Five best-scored captures; ties broken by recency.
                    db = SessionLocal()
                    try:
                        rows = db.query(Listing).order_by(Listing.score.desc(), Listing.timestamp.desc()).limit(5).all()
                        if rows:
                            lines = []
                            for i, r in enumerate(rows, 1):
                                price_str = f"{r.currency or ''}{r.price:.0f}" if r.price else "?"
                                lines.append(f"{i}. {r.title[:45]}\n 💰 {price_str} | ⭐ {r.score} | 🌐 {r.site_name or '?'}")
                            await send_telegram("🏆 <b>Top 5 by Score:</b>\n\n" + "\n\n".join(lines))
                        else:
                            await send_telegram("No listings found yet.")
                    finally:
                        db.close()
                elif text == "/sites":
                    # Per-site health: enabled flag + consecutive failure count.
                    db = SessionLocal()
                    try:
                        sites = db.query(TargetSite).all()
                        if sites:
                            lines = []
                            for s in sites:
                                status = "🟢" if s.enabled == 1 else "🔴"
                                health = f"⚠️ {s.consecutive_failures} fails" if (s.consecutive_failures or 0) >= 3 else ""
                                lines.append(f"{status} {s.name}{health}")
                            await send_telegram("🌐 <b>Target Sites:</b>\n" + "\n".join(lines))
                        else:
                            await send_telegram("No sites configured.")
                    finally:
                        db.close()
                elif text.startswith("/alert "):
                    # /alert on <keyword> or /alert off <keyword>
                    parts = text.split(" ", 2)
                    if len(parts) == 3 and parts[1].lower() in ("on", "off"):
                        action, kw_term = parts[1].lower(), parts[2].strip()
                        db = SessionLocal()
                        try:
                            kw = db.query(Keyword).filter(Keyword.term.ilike(kw_term)).first()
                            if kw:
                                # Weight sign doubles as the mute flag:
                                # positive = alerts on, negative = muted (see /keywords).
                                kw.weight = abs(kw.weight) if action == "on" else -abs(kw.weight)
                                db.flush()
                                db.commit()
                                await send_telegram(f"✅ Keyword '<b>{kw.term}</b>' alerts turned <b>{action.upper()}</b>.")
                            else:
                                await send_telegram(f"❌ Keyword '<b>{kw_term}</b>' not found. Use /keywords to list all.")
                        finally:
                            db.close()
                    else:
                        await send_telegram("Usage: /alert on &lt;keyword&gt; or /alert off &lt;keyword&gt;")
                elif text == "/keywords":
                    # List all keywords with their on/off (sign) state.
                    db = SessionLocal()
                    try:
                        kws = db.query(Keyword).all()
                        if kws:
                            lines = [f"{'🟢' if (k.weight or 0) > 0 else '🔴'} {k.term} (weight {k.weight})" for k in kws]
                            await send_telegram("🔍 <b>Keywords:</b>\n" + "\n".join(lines))
                        else:
                            await send_telegram("No keywords configured.")
                    finally:
                        db.close()
                elif text == "/help":
                    await send_telegram(
                        "🕵️ <b>Ghost Node Commands:</b>\n"
                        "/status — engine health\n"
                        "/listings — last 5 captures\n"
                        "/top5 — top 5 by score\n"
                        "/sites — site health\n"
                        "/keywords — all keywords\n"
                        "/alert on &lt;kw&gt; — re-enable keyword\n"
                        "/alert off &lt;kw&gt; — mute keyword\n"
                        "/pause — pause engine\n"
                        "/resume — resume engine"
                    )
        except Exception as exc:
            # Broad catch by design: Thread C must survive any single failure.
            print(f"[Thread C] Error: {exc}")
        await asyncio.sleep(3)
def timeLeftToMins(tl: str) -> float:
    """
    Convert a '2d 4h 30m 45s' style string to total minutes as a float.

    Includes seconds so the countdown is accurate to the second.
    Returns float('inf') if nothing parseable (no time data scraped) —
    "infinitely far away" sorts such lots last.

    (Name kept camelCase for compatibility with existing callers.)
    """
    if not tl or not tl.strip():
        return float("inf")
    mins = 0.0
    # Uses the module-level `re` import; the previous local
    # `import re as _re` needlessly shadowed it.  Each unit is optional.
    d = re.search(r"(\d+)\s*d", tl)
    if d:
        mins += int(d.group(1)) * 1440
    h = re.search(r"(\d+)\s*h", tl)
    if h:
        mins += int(h.group(1)) * 60
    m = re.search(r"(\d+)\s*m(?!s)", tl)  # 'm' but not 'ms'
    if m:
        mins += int(m.group(1))
    s = re.search(r"(\d+)\s*s(?!\w)", tl)  # bare seconds
    if s:
        mins += int(s.group(1)) / 60.0
    # All-zero or unparseable input counts as "no time data".
    return mins if mins > 0 else float("inf")
# ─────────────────────────────────────────────────────────────────────────────
# Thread B — Nuclear Scraper Engine
# ─────────────────────────────────────────────────────────────────────────────
# Currency symbol / code → ISO code map
_CURRENCY_MAP: list[tuple[str, str]] = [
("CA$", "CAD"), # must come before bare "$"
("CAD", "CAD"),
("US$", "USD"),
("USD", "USD"),
("$", "USD"), # default bare $ → USD
("£", "GBP"),
("GBP", "GBP"),
("", "EUR"),
("EUR", "EUR"),
("AU$", "AUD"),
("AUD", "AUD"),
("NZ$", "NZD"),
("NZD", "NZD"),
("CHF", "CHF"),
("SEK", "SEK"),
("NOK", "NOK"),
("DKK", "DKK"),
("JPY", "JPY"),
("¥", "JPY"),
("CNY", "CNY"),
("HKD", "HKD"),
("MXN", "MXN"),
("BRL", "BRL"),
("INR", "INR"),
("", "INR"),
]
# ─────────────────────────────────────────────────────────────────────────────
# N16 — AI Filter Engine (Groq + Ollama)
# ─────────────────────────────────────────────────────────────────────────────
def _build_ai_prompt(title: str, ai_target: str, description: str = "") -> str:
"""
Build a compact, token-efficient prompt for lot classification.
Includes lot description when available (N18) for richer AI decisions.
We keep it short to maximise Groq free-tier token budget.
"""
desc_block = f"Lot description: {description[:600]}\n" if description and description.strip() else ""
return (
"You are an auction lot classifier. Decide if this lot matches what the user wants.\n"
"Rules:\n"
" - Only return YES or NO followed by a colon and a very short reason (max 12 words).\n"
" - Be strict. Accessories, cases, chargers, covers, screen protectors, keyboards, "
"manuals, boxes, cables, stands, docks, and any non-device items are always NO.\n"
" - If the lot title is vague or unclear, use the description to clarify. If still unclear, return NO.\n"
" - Format exactly: YES: reason OR NO: reason\n\n"
f"User wants: {ai_target}\n"
f"Lot title: {title}\n"
+ desc_block
+ "Answer:"
)
def _ai_debug_enabled() -> bool:
    """True when the 'ai_debug' Config key is 'true' (case/whitespace-insensitive)."""
    flag = _get_config("ai_debug", "false")
    return flag.strip().lower() == "true"
def _ai_debug_print(tag: str, label: str, content: str, max_chars: int = 2000) -> None:
"""Print a clearly-framed AI debug block to console."""
sep = "" * 60
print(f"\n[{tag}] ┌{sep}")
print(f"[{tag}] │ {label}")
print(f"[{tag}] ├{sep}")
for line in content[:max_chars].splitlines():
print(f"[{tag}] │ {line}")
if len(content) > max_chars:
print(f"[{tag}] │ ... (truncated — {len(content)} chars total)")
print(f"[{tag}] └{sep}\n")
async def _ai_call_groq(prompt: str, api_key: str, model: str, _ctx: dict | None = None) -> tuple[bool, str]:
"""Call Groq's OpenAI-compatible free API. Returns (match, reason).
_ctx: optional metadata dict injected into the debug log entry (e.g. title, call_type).
"""
if not api_key:
print("[AI] ⚠️ Groq API key not set — skipping AI filter.")
return True, "no-key"
_debug = _ai_debug_enabled()
_resolved_model = model or "llama-3.3-70b-versatile"
if _debug:
_ai_debug_print("AI-DEBUG", f"GROQ REQUEST model={_resolved_model}", prompt)
_ai_log_entry({
"call_type": (_ctx or {}).get("call_type", "filter"),
"direction": "request",
"provider": "groq",
"model": _resolved_model,
"content": prompt,
"title": (_ctx or {}).get("title"),
"site": (_ctx or {}).get("site"),
})
try:
async with httpx.AsyncClient(timeout=20) as client:
r = await client.post(
"https://api.groq.com/openai/v1/chat/completions",
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
},
json={
"model": _resolved_model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 40,
"temperature": 0.0,
},
)
if r.status_code == 200:
rj = r.json()
raw_text = rj["choices"][0]["message"]["content"].strip()
usage = rj.get("usage", {})
tok_p = usage.get("prompt_tokens")
tok_c = usage.get("completion_tokens")
if _debug:
_ai_debug_print("AI-DEBUG", f"GROQ RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_text)
match_v = raw_text.upper().startswith("YES")
_ai_log_entry({
"call_type": (_ctx or {}).get("call_type", "filter"),
"direction": "response",
"provider": "groq",
"model": _resolved_model,
"content": raw_text,
"tokens_prompt": tok_p,
"tokens_completion": tok_c,
"verdict": "YES" if match_v else "NO",
"title": (_ctx or {}).get("title"),
"site": (_ctx or {}).get("site"),
})
match = raw_text.upper().startswith("YES")
reason = raw_text.split(":", 1)[1].strip() if ":" in raw_text else raw_text[:80]
return match, reason[:200]
elif r.status_code == 429:
print("[AI] ⚠️ Groq rate limit hit — passing lot through.")
if _debug:
_ai_debug_print("AI-DEBUG", "GROQ RATE LIMIT RESPONSE (429)", r.text)
_ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
"model": _resolved_model, "content": "429 Rate limit hit", "status_code": 429})
return True, "rate-limit"
else:
print(f"[AI] ❌ Groq HTTP {r.status_code}: {r.text[:200]}")
if _debug:
_ai_debug_print("AI-DEBUG", f"GROQ ERROR RESPONSE ({r.status_code})", r.text)
_ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
"model": _resolved_model, "content": r.text[:500], "status_code": r.status_code})
return True, f"api-error-{r.status_code}"
except Exception as exc:
print(f"[AI] ❌ Groq call failed: {exc}")
if _debug:
_ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
"model": _resolved_model, "content": str(exc)})
return True, "exception"
async def _ai_call_ollama(prompt: str, model: str, base_url: str, _ctx: dict | None = None) -> tuple[bool, str]:
    """Call a local Ollama instance. Returns (match, reason).

    _ctx: optional metadata dict injected into the debug log entry.

    Fail-open policy: HTTP errors, an unreachable daemon, or any
    exception return (True, <tag>) so listings still pass through when
    the local model is down.
    """
    url = f"{base_url.rstrip('/')}/api/generate"
    _debug = _ai_debug_enabled()
    _resolved_model = model or "llama3.2:3b"
    if _debug:
        _ai_debug_print("AI-DEBUG", f"OLLAMA REQUEST url={url} model={_resolved_model}", prompt)
    _ai_log_entry({
        "call_type": (_ctx or {}).get("call_type", "filter"),
        "direction": "request",
        "provider": "ollama",
        "model": _resolved_model,
        "content": prompt,
        "title": (_ctx or {}).get("title"),
        "site": (_ctx or {}).get("site"),
    })
    try:
        async with httpx.AsyncClient(timeout=60) as client:
            r = await client.post(
                url,
                json={"model": _resolved_model, "prompt": prompt, "stream": False},
            )
            if r.status_code == 200:
                rj = r.json()
                raw_text = rj.get("response", "").strip()
                # Ollama eval_count ≈ completion tokens; prompt_eval_count ≈ prompt tokens
                tok_p = rj.get("prompt_eval_count")
                tok_c = rj.get("eval_count")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"OLLAMA RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_text)
                # Decide the verdict once (previously computed twice).
                match = raw_text.upper().startswith("YES")
                _ai_log_entry({
                    "call_type": (_ctx or {}).get("call_type", "filter"),
                    "direction": "response",
                    "provider": "ollama",
                    "model": _resolved_model,
                    "content": raw_text,
                    "tokens_prompt": tok_p,
                    "tokens_completion": tok_c,
                    "verdict": "YES" if match else "NO",
                    "title": (_ctx or {}).get("title"),
                    "site": (_ctx or {}).get("site"),
                })
                reason = raw_text.split(":", 1)[1].strip() if ":" in raw_text else raw_text[:80]
                return match, reason[:200]
            else:
                print(f"[AI] ❌ Ollama HTTP {r.status_code}: {r.text[:200]}")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"OLLAMA ERROR RESPONSE ({r.status_code})", r.text)
                _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                               "model": _resolved_model, "content": r.text[:500], "status_code": r.status_code})
                return True, f"ollama-error-{r.status_code}"
    except httpx.ConnectError:
        print(f"[AI] ❌ Ollama not reachable at {base_url} — is it running? Passing lot through.")
        if _debug:
            _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                           "model": _resolved_model, "content": f"Connection refused at {base_url}"})
        return True, "ollama-offline"
    except Exception as exc:
        print(f"[AI] ❌ Ollama call failed: {exc}")
        if _debug:
            _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                           "model": _resolved_model, "content": str(exc)})
        return True, "exception"
async def _ai_analyze(title: str, ai_target: str, description: str = "") -> tuple[bool, str]:
    """
    Dispatch one lot to the configured AI provider.

    Provider/model/keys are re-read from Config on every call, so
    Settings changes take effect without restart.
    Returns (match: bool, reason: str).
    If AI is misconfigured or errors, defaults to True (pass through)
    so the scraper never silently drops listings.
    description (N18): optional full lot text from the detail page —
    improves accuracy when the lot title alone is vague.
    """
    provider = _get_config("ai_provider", "groq").strip().lower()
    if provider == "none":
        return True, ""
    prompt = _build_ai_prompt(title, ai_target, description)
    model = _get_config("ai_model", "").strip()
    _ctx = {"call_type": "filter", "title": title}
    if provider == "groq":
        api_key = _get_config("ai_api_key", "").strip()
        match, reason = await _ai_call_groq(
            prompt, api_key, model or "llama-3.3-70b-versatile", _ctx=_ctx
        )
    elif provider == "ollama":
        base_url = _get_config("ai_base_url", "http://localhost:11434").strip()
        match, reason = await _ai_call_ollama(
            prompt, model or "llama3.2:3b", base_url, _ctx=_ctx
        )
    else:
        # Unknown provider — treat like "none" and pass the lot through.
        return True, ""
    verdict = "✅ YES" if match else "❌ NO"
    print(f"[AI] {verdict}{title[:60]}{reason}")
    return match, reason
# ─────────────────────────────────────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────────────────────
# N17 — Auto-Adapter: AI-powered CSS selector generator
# Supports: Groq (free cloud) + Ollama (local unlimited)
# ─────────────────────────────────────────────────────────────────────────────
def _clean_html_for_ai(raw_html: str, max_chars: int = 14000) -> str:
"""
Strip everything that wastes tokens and confuses the AI:
- <script> and <style> blocks (content removed entirely)
- SVG blobs
- HTML comments
- Inline event handlers (on*)
- data: URIs (they are enormous)
- Excessive whitespace
Keeps class names, id attributes, aria-label, data-* attributes
because those are what CSS selectors are built from.
Then tries to isolate the main content area before truncating.
"""
# Remove block-level noise
raw_html = re.sub(r'<script[^>]*>.*?</script>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)
raw_html = re.sub(r'<style[^>]*>.*?</style>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)
raw_html = re.sub(r'<svg[^>]*>.*?</svg>', '<svg/>', raw_html, flags=re.DOTALL | re.IGNORECASE)
raw_html = re.sub(r'<!--.*?-->', '', raw_html, flags=re.DOTALL)
# Remove event handlers + data URIs
raw_html = re.sub(r'\s+on\w+="[^"]*"', '', raw_html)
raw_html = re.sub(r'\s+on\w+=\'[^\']*\'', '', raw_html)
raw_html = re.sub(r'(src|href|style)="data:[^"]*"', r'\1="data:..."', raw_html)
# Collapse whitespace
raw_html = re.sub(r'\s{2,}', ' ', raw_html).strip()
# Try to isolate the main content region (listings are usually here)
for main_pat in [
r'<main[^>]*>(.*?)</main>',
r'<\w+[^>]+role=["\']main["\'][^>]*>(.*?)</\w+>',
r'<\w+[^>]+id=["\'](?:content|main|results|listings|items|products)["\'][^>]*>(.*?)(?=<(?:footer|aside))',
r'<\w+[^>]+class=["\'][^"\']*(?:results|listings|items|products|catalog)[^"\']*["\'][^>]*>(.*?)(?=<(?:footer|aside))',
]:
m = re.search(main_pat, raw_html, re.DOTALL | re.IGNORECASE)
if m and len(m.group(1)) > 500:
return m.group(1)[:max_chars]
return raw_html[:max_chars]
def _build_selector_prompt(cleaned_html: str, site_name: str) -> str:
return f"""You are an expert web scraping engineer analyzing an auction/bidding website called "{site_name}".
Study the HTML below and find the CSS selectors for extracting auction listings.
RULES:
1. "container" = the CSS selector that matches EACH repeated listing card/item (must return 3+ elements)
2. "title_sel", "price_sel", "time_sel", "link_sel" = selectors RELATIVE to container (use querySelector inside the container)
3. For link_sel: if the container element itself is an <a> tag, write "self". Otherwise the CSS selector for the <a> inside the container.
4. For next_page_sel: the button or link to go to the next page. null if not found or not applicable.
5. If you cannot find a reliable selector for a field, use null.
6. Use the most specific and stable selector possible (prefer id > data-* attributes > class names).
7. Return ONLY a valid JSON object. No explanation, no markdown, no code blocks. Just the JSON.
REQUIRED JSON FORMAT:
{{"container": "...", "title_sel": "...", "price_sel": "...", "time_sel": "...", "link_sel": "...", "next_page_sel": "..."}}
HTML:
{cleaned_html}"""
async def _generate_selectors_ai(cleaned_html: str, site_name: str) -> dict | None:
    """
    Send cleaned HTML to the configured AI provider and parse the returned
    JSON selector map. Returns a dict with keys matching SiteSelectors fields,
    or None on failure.

    Provider/model are re-read from Config on each call, so Settings
    changes take effect without restart.  All request/response/error
    payloads are mirrored into the AI log; ai_debug=true additionally
    prints framed console dumps.
    """
    provider = _get_config("ai_provider", "groq").strip().lower()
    model = _get_config("ai_model", "").strip()
    prompt = _build_selector_prompt(cleaned_html, site_name)
    _debug = _ai_debug_enabled()
    if _debug:
        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT PROMPT site={site_name} provider={provider} model={model} html_chars={len(cleaned_html)}", prompt, max_chars=3000)
    _ai_log_entry({
        "call_type": "adapt",
        "direction": "request",
        "provider": provider,
        "model": model,
        "content": prompt,
        "site": site_name,
    })
    raw_response = ""
    # ── Groq path (cloud, OpenAI-compatible chat endpoint) ────────────────
    if provider == "groq":
        api_key = _get_config("ai_api_key", "").strip()
        if not model: model = "llama-3.3-70b-versatile"  # default Groq model
        try:
            async with httpx.AsyncClient(timeout=40) as client:
                r = await client.post(
                    "https://api.groq.com/openai/v1/chat/completions",
                    headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
                    json={"model": model, "messages": [{"role": "user", "content": prompt}],
                          "max_tokens": 500, "temperature": 0.0},
                )
                if r.status_code == 200:
                    rj = r.json()
                    raw_response = rj["choices"][0]["message"]["content"].strip()
                    usage = rj.get("usage", {})
                    tok_p = usage.get("prompt_tokens")
                    tok_c = usage.get("completion_tokens")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT GROQ RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_response)
                    _ai_log_entry({
                        "call_type": "adapt",
                        "direction": "response",
                        "provider": "groq",
                        "model": model,
                        "content": raw_response,
                        "tokens_prompt": tok_p,
                        "tokens_completion": tok_c,
                        "site": site_name,
                    })
                else:
                    print(f"[AutoAdapt] ❌ Groq HTTP {r.status_code}: {r.text[:200]}")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT GROQ ERROR ({r.status_code})", r.text)
                    _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "groq",
                                   "model": model, "content": r.text[:500], "status_code": r.status_code, "site": site_name})
                    return None
        except Exception as exc:
            print(f"[AutoAdapt] ❌ Groq call failed: {exc}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "groq",
                               "model": model, "content": str(exc), "site": site_name})
            return None
    # ── Ollama path (local daemon, generous 120 s timeout) ────────────────
    elif provider == "ollama":
        base_url = _get_config("ai_base_url", "http://localhost:11434").strip()
        if not model: model = "llama3.2:3b"  # default local model
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                r = await client.post(
                    f"{base_url.rstrip('/')}/api/generate",
                    json={"model": model, "prompt": prompt, "stream": False},
                )
                if r.status_code == 200:
                    rj = r.json()
                    raw_response = rj.get("response", "").strip()
                    # prompt_eval_count / eval_count ≈ prompt / completion tokens
                    tok_p = rj.get("prompt_eval_count")
                    tok_c = rj.get("eval_count")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT OLLAMA RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_response)
                    _ai_log_entry({
                        "call_type": "adapt",
                        "direction": "response",
                        "provider": "ollama",
                        "model": model,
                        "content": raw_response,
                        "tokens_prompt": tok_p,
                        "tokens_completion": tok_c,
                        "site": site_name,
                    })
                else:
                    print(f"[AutoAdapt] ❌ Ollama HTTP {r.status_code}")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT OLLAMA ERROR ({r.status_code})", r.text)
                    _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                                   "model": model, "content": r.text[:500], "status_code": r.status_code, "site": site_name})
                    return None
        except httpx.ConnectError:
            print(f"[AutoAdapt] ❌ Ollama not reachable at {base_url}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                               "model": model, "content": f"Connection refused at {base_url}", "site": site_name})
            return None
        except Exception as exc:
            print(f"[AutoAdapt] ❌ Ollama call failed: {exc}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                               "model": model, "content": str(exc), "site": site_name})
            return None
    else:
        print("[AutoAdapt] ⚠️ No AI provider configured.")
        return None
    # Extract JSON from response — multi-strategy, most-to-least strict
    def _extract_json(text: str) -> dict | None:
        # Strategy 1: direct parse (AI returned pure JSON)
        try:
            obj = json.loads(text)
            if isinstance(obj, dict) and "container" in obj:
                return obj
        except Exception:
            pass
        # Strategy 2: strip markdown code fence (```json ... ``` or ``` ... ```)
        stripped = re.sub(r'^```(?:json)?\s*', '', text.strip(), flags=re.IGNORECASE)
        stripped = re.sub(r'\s*```$', '', stripped.strip())
        try:
            obj = json.loads(stripped)
            if isinstance(obj, dict) and "container" in obj:
                return obj
        except Exception:
            pass
        # Strategy 3: find the outermost {...} block that contains "container"
        brace_start = text.find('{')
        if brace_start != -1:
            depth, end = 0, -1
            for i, ch in enumerate(text[brace_start:], brace_start):
                if ch == '{': depth += 1
                elif ch == '}':
                    depth -= 1
                    if depth == 0:
                        end = i + 1
                        break
            if end != -1:
                try:
                    obj = json.loads(text[brace_start:end])
                    if isinstance(obj, dict) and "container" in obj:
                        return obj
                except Exception:
                    pass
        # Strategy 4: regex fallback — any {...} block mentioning "container"
        m = re.search(r'\{[^{}]{0,2000}"container"[^{}]{0,2000}\}', text, re.DOTALL)
        if m:
            try:
                return json.loads(m.group())
            except Exception:
                pass
        return None
    data = _extract_json(raw_response)
    if data is None:
        print(f"[AutoAdapt] ❌ No JSON found in AI response: {raw_response[:300]}")
        if _debug:
            _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": provider,
                           "model": model, "content": f"No JSON found in: {raw_response[:400]}", "site": site_name})
        return None
    print(f"[AutoAdapt] ✅ AI returned selectors: {data}")
    return data
async def _validate_selectors(page, sel_dict: dict) -> tuple[float, int, float, float, str]:
"""
Test generated selectors live on the current page.
Returns (confidence 0-100, container_count, title_rate, price_rate, notes).
"""
container_sel = (sel_dict.get("container") or "").strip()
title_sel = (sel_dict.get("title_sel") or "").strip()
price_sel = (sel_dict.get("price_sel") or "").strip()
link_sel = (sel_dict.get("link_sel") or "").strip()
if not container_sel:
return 0.0, 0, 0.0, 0.0, "No container selector returned by AI"
try:
containers = await page.query_selector_all(container_sel)
except Exception as exc:
return 0.0, 0, 0.0, 0.0, f"Container selector error: {exc}"
count = len(containers)
if count < 2:
return 5.0, count, 0.0, 0.0, f"Container '{container_sel}' matched only {count} elements"
# Sample up to 10 containers for field testing
sample = containers[:min(10, count)]
title_hits = price_hits = link_hits = 0
for el in sample:
try:
if title_sel:
t_el = await el.query_selector(title_sel)
if t_el:
txt = (await t_el.inner_text()).strip()
if len(txt) > 3:
title_hits += 1
if price_sel:
p_el = await el.query_selector(price_sel)
if p_el:
txt = (await p_el.inner_text()).strip()
if any(c.isdigit() for c in txt):
price_hits += 1
if link_sel:
if link_sel == "self":
tag = await el.evaluate("e => e.tagName")
href = await el.evaluate("e => e.href || e.getAttribute('href') || ''")
if href:
link_hits += 1
else:
l_el = await el.query_selector(link_sel)
href = await l_el.evaluate("e => e.href || e.getAttribute('href') || ''") if l_el else ""
if href:
link_hits += 1
except Exception:
continue
n = len(sample)
title_rate = round(title_hits / n * 100, 1) if n else 0
price_rate = round(price_hits / n * 100, 1) if n else 0
link_rate = round(link_hits / n * 100, 1) if n else 0
# Confidence formula: weighted score
# Container count matters most, then title, link, price
count_score = min(40, (count / 20) * 40) # up to 40 pts for 20+ containers
title_score = (title_rate / 100) * 30 # up to 30 pts
link_score = (link_rate / 100) * 20 # up to 20 pts
price_score = (price_rate / 100) * 10 # up to 10 pts
confidence = round(count_score + title_score + link_score + price_score, 1)
notes = (
f"Container: {count} items | Title: {title_rate}% | "
f"Price: {price_rate}% | Link: {link_rate}%"
)
print(f"[AutoAdapt] 📊 Validation — {notes} → confidence {confidence}")
return confidence, count, title_rate, price_rate, notes
async def _extract_with_selectors(page, ss: "SiteSelectors") -> list[dict]:
    """
    Extract listing rows from the current page using stored AI selectors.

    Produces dicts with title / price_text / time_text / href keys — the
    shape the existing scrape pipeline expects.  Failures degrade softly:
    a broken card is skipped, and any page-level error returns [] so the
    caller can fall back to the JS_EXTRACT path.
    """
    try:
        cards = await page.query_selector_all(ss.container_sel)
        if not cards:
            return []
        extracted: list[dict] = []
        for card in cards[:60]:
            try:
                # Title — prefer the configured selector, fall back to card text.
                title = ""
                if ss.title_sel:
                    node = await card.query_selector(ss.title_sel)
                    if node:
                        title = (await node.inner_text()).strip()
                if not title:
                    title = (await card.inner_text()).strip()[:200]
                # Raw price text (parsed downstream).
                price_text = ""
                if ss.price_sel:
                    node = await card.query_selector(ss.price_sel)
                    if node:
                        price_text = (await node.inner_text()).strip()
                # Time-remaining text.
                time_text = ""
                if ss.time_sel:
                    node = await card.query_selector(ss.time_sel)
                    if node:
                        time_text = (await node.inner_text()).strip()
                # Link — "self" means the card element itself is the anchor.
                href = ""
                if ss.link_sel == "self":
                    href = await card.evaluate("e => e.href || e.getAttribute('href') || ''")
                elif ss.link_sel:
                    node = await card.query_selector(ss.link_sel)
                    if node:
                        href = await node.evaluate("e => e.href || e.getAttribute('href') || ''")
                if not href:
                    # Last resort — any anchor inside the card.
                    node = await card.query_selector("a[href]")
                    if node:
                        href = await node.evaluate("e => e.href || ''")
                if title and len(title) >= 5:
                    extracted.append({"title": title, "price_text": price_text,
                                      "time_text": time_text, "href": href})
            except Exception:
                continue
        return extracted
    except Exception as exc:
        print(f"[AutoAdapt] ⚠️ _extract_with_selectors failed: {exc}")
        return []
async def _auto_dismiss_popups(page) -> bool:
    """
    Best-effort dismissal of cookie consent, GDPR banners, age-gates and
    terms-of-service overlays before HTML extraction.

    Pass 1 clicks precise selectors for known consent frameworks (fast +
    accurate); pass 2 falls back to any visible button whose label
    matches a common accept phrase.  Silent on failure — never raises.
    Returns True if something was clicked.
    """
    # ── Known consent-framework CSS selectors ─────────────────────────────────
    known_selectors = [
        "#onetrust-accept-btn-handler",              # OneTrust (very widespread)
        "#CybotCookiebotDialogBodyButtonAccept",     # Cookiebot
        "#cookieConsentAcceptButton",
        "#cookie-notice-accept-button",
        ".cookie-accept-button",
        ".js-cookie-accept",
        ".cc-btn.cc-allow",                          # Cookie Consent (osano)
        "[data-action='accept-cookies']",
        "[data-cookiebanner='accept_button']",
        "[aria-label*='accept' i]",
        "[aria-label*='agree' i]",
        "button#accept-all",
        "button.accept-all",
        "#gdpr-cookie-accept",
        ".gdpr-accept-btn",
        "#age-gate-submit",                          # age gates
        "button[data-testid='cookie-policy-dialog-accept-button']",
    ]
    for css in known_selectors:
        try:
            candidate = page.locator(css).first
            if await candidate.is_visible(timeout=400):
                await candidate.click(timeout=1000)
                await asyncio.sleep(0.6)
                return True
        except Exception:
            continue
    # ── Text-based fallback for any visible button ─────────────────────────────
    accept_labels = [
        "Accept all", "Accept All", "Accept Cookies", "Accept cookies",
        "I Accept", "I accept", "I Agree", "I agree", "Agree",
        "Accept", "Allow all", "Allow All", "Allow", "Got it",
        "OK", "Ok", "Confirm", "Continue", "I understand",
        "Dismiss", "Close", "I am 18+", "Enter site",
    ]
    for label in accept_labels:
        try:
            button = page.get_by_role("button", name=label, exact=False).first
            if await button.is_visible(timeout=300):
                await button.click(timeout=1000)
                await asyncio.sleep(0.6)
                return True
        except Exception:
            continue
    return False
async def adapt_site_now(site_id: int) -> dict:
"""
Full AI adaptation pipeline for one site.
Launches a temporary browser (reusing saved login profile if one exists
so that Cloudflare/session-gated sites work), auto-dismisses cookie/terms
popups, navigates using a test keyword, extracts + cleans HTML, calls AI
for selectors, validates them, stores result in SiteSelectors table.
Returns a status dict with confidence score and notes.
Called both from the API endpoint and from the self-healer.
Works with both Groq (online, best quality) and Ollama (local, unlimited).
"""
db = SessionLocal()
try:
site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
if not site:
return {"error": "Site not found"}
kw_row = db.query(Keyword).filter(Keyword.term != "").first()
test_term = kw_row.term if kw_row else "laptop"
site_name = site.name
url_template = site.url_template
search_sel = site.search_selector or ""
finally:
db.close()
is_direct = "{keyword}" in url_template
visit_url = url_template.replace("{keyword}", test_term.replace(" ", "+")) if is_direct else url_template
print(f"[AutoAdapt] 🚀 Starting adaptation for '{site_name}'{visit_url}")
provider = _get_config("ai_provider", "groq").strip().lower()
if provider == "none":
return {"error": "AI provider is set to none — configure Groq or Ollama in Settings first"}
# Check for a saved browser profile (created by the 🔑 Login button).
# If one exists we use launch_persistent_context() so the saved Cloudflare
# cookies / login session carry over automatically — no manual steps needed.
_site_slug = re.sub(r"[^\w]", "_", site_name.lower())[:20]
_profile_dir = os.path.join(os.path.dirname(__file__), ".browser_profiles", _site_slug)
_use_profile = os.path.isdir(_profile_dir) and bool(os.listdir(_profile_dir))
if _use_profile:
print(f"[AutoAdapt] 🔐 Saved browser profile found — using session cookies for '{site_name}'")
else:
print(f"[AutoAdapt] 🌐 No saved profile — launching fresh browser for '{site_name}'")
_LAUNCH_ARGS = [
"--no-sandbox", "--disable-dev-shm-usage",
"--disable-blink-features=AutomationControlled",
"--disable-background-timer-throttling",
"--disable-renderer-backgrounding",
]
# ── CF check JS (reused in both headless and headful passes) ──────────────
_CF_JS = """() => !!(
document.querySelector('#challenge-stage') ||
document.querySelector('#cf-please-wait') ||
document.querySelector('.cf-browser-verification') ||
document.title.includes('Just a moment') ||
document.title.includes('Checking your browser')
)"""
sel_dict = None
confidence = count = 0
title_rate = price_rate = 0.0
notes = ""
_adapt_error = None
try:
async with async_playwright() as pw:
browser_label, exe = _resolve_browser()
profile = random.choice(_agent_profiles)
# Two-pass loop: headless first (fast, invisible), then headful if CF fires.
# CF is dramatically more lenient with visible browsers; the stealth patches
# plus a real on-screen window passes the check in the vast majority of cases.
# If headful ALSO gets CF and a CAPTCHA solver is configured, Turnstile is
# solved automatically via API — zero manual steps required.
for _headless in [True, False]:
if not _headless:
print(f"[AutoAdapt] 🖥️ Retrying '{site_name}' with visible browser "
f"(window will appear briefly then close automatically)...")
browser = None
ctx = None
# ── Launch browser ─────────────────────────────────────────────
if _use_profile:
_pctx_kw: dict = {
"headless": _headless,
"args": _LAUNCH_ARGS,
"user_agent": profile["ua"],
"viewport": {"width": profile["viewport"][0], "height": profile["viewport"][1]},
"locale": profile["locale"],
}
if exe: _pctx_kw["executable_path"] = exe
ctx = await pw.chromium.launch_persistent_context(_profile_dir, **_pctx_kw)
else:
_lk: dict = {"headless": _headless, "args": _LAUNCH_ARGS}
if exe: _lk["executable_path"] = exe
browser = await pw.chromium.launch(**_lk)
ctx = await browser.new_context(
user_agent=profile["ua"],
viewport={"width": profile["viewport"][0], "height": profile["viewport"][1]},
locale=profile["locale"],
)
await ctx.add_init_script(_build_stealth_script(profile))
page = await ctx.new_page()
await page.route("**/*.{png,jpg,jpeg,gif,webp,woff,woff2,ttf,svg,ico}", lambda r: r.abort())
try:
# ── Navigate ───────────────────────────────────────────────
await page.goto(visit_url, timeout=60_000, wait_until="domcontentloaded")
# Auto-dismiss cookie/terms popups before search box interaction
if await _auto_dismiss_popups(page):
print(f"[AutoAdapt] 🍪 Popup auto-dismissed for '{site_name}'")
# ── Cloudflare detection ───────────────────────────────────
_on_cf = await page.evaluate(_CF_JS)
if _on_cf:
if _headless:
# Headless blocked — retry non-headless automatically
print(f"[AutoAdapt] 🚧 CF detected (headless) — "
f"retrying with visible browser...")
continue # finally closes browser; next iter uses headless=False
# Headful also blocked — try Turnstile CAPTCHA solver
_solver = _get_config("captcha_solver", "none").lower()
_api_key = _get_config("captcha_api_key", "").strip()
if _solver in ("2captcha", "capsolver") and _api_key:
print(f"[AutoAdapt] 🔐 CF blocked visible browser — "
f"trying Turnstile solver ({_solver})...")
_ts_ok = await _solve_cf_turnstile(page, _solver, _api_key)
if _ts_ok:
# Wait for CF to redirect to the actual page
await page.wait_for_load_state("domcontentloaded", timeout=20_000)
await asyncio.sleep(1)
_on_cf = False # fall through to extraction below
else:
_adapt_error = (
"Cloudflare Turnstile could not be solved automatically. "
"Check CAPTCHA solver config and API key balance in Settings."
)
break
else:
_adapt_error = (
"Cloudflare blocked adapt in both headless and visible-browser modes. "
"For fully automatic solving: configure a CAPTCHA solver (2captcha or "
"CapSolver) in Settings → CAPTCHA. "
"Or: enable 'Requires Login' for this site and click 🔑 Login once "
"to save a session manually."
)
break
if not _on_cf:
# ── Mode B search ──────────────────────────────────────
if not is_direct:
try:
search_el = await _discover_search_input(page, search_sel, site_name)
await search_el.fill(test_term)
await search_el.press("Enter")
await page.wait_for_load_state("domcontentloaded", timeout=30_000)
await asyncio.sleep(2)
await _auto_dismiss_popups(page)
except Exception as se:
print(f"[AutoAdapt] ⚠️ Search interaction failed: {se} — using homepage HTML")
await asyncio.sleep(1.5) # let lazy-loaded content settle
raw_html = await page.content()
cleaned = _clean_html_for_ai(raw_html)
print(f"[AutoAdapt] 📄 Cleaned HTML: {len(cleaned)} chars → sending to {provider}")
sel_dict = await _generate_selectors_ai(cleaned, site_name)
if not sel_dict:
_adapt_error = "AI did not return usable selectors. Check provider config."
break
confidence, count, title_rate, price_rate, notes = \
await _validate_selectors(page, sel_dict)
break # ✅ success
finally:
if browser:
await browser.close()
elif ctx:
await ctx.close()
except Exception as exc:
print(f"[AutoAdapt] ❌ Browser error: {exc}")
return {"error": f"Browser launch failed: {exc}"}
if _adapt_error:
return {"error": _adapt_error}
if not sel_dict:
return {"error": "AI did not return usable selectors. Check provider config."}
# Persist to DB
db2 = SessionLocal()
try:
row = db2.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
if not row:
row = SiteSelectors(site_id=site_id)
db2.add(row)
row.container_sel = (sel_dict.get("container") or "")[:500]
row.title_sel = (sel_dict.get("title_sel") or "")[:500]
row.price_sel = (sel_dict.get("price_sel") or "")[:500]
row.time_sel = (sel_dict.get("time_sel") or "")[:500]
row.link_sel = (sel_dict.get("link_sel") or "")[:500]
row.next_page_sel = (sel_dict.get("next_page_sel") or "")[:500]
row.confidence = confidence
row.container_count = count
row.title_rate = title_rate
row.price_rate = price_rate
row.provider = provider
row.generated_at = datetime.now()
row.last_tested_at = datetime.now()
row.stale = False
row.notes = notes[:1000] if notes else ""
db2.flush()
db2.commit()
print(f"[AutoAdapt] ✅ Selectors saved for '{site_name}' — confidence {confidence}")
finally:
db2.close()
return {
"status": "done",
"site_name": site_name,
"confidence": confidence,
"container_count": count,
"title_rate": title_rate,
"price_rate": price_rate,
"notes": notes,
"selectors": sel_dict,
"provider": provider,
}
# ─────────────────────────────────────────────────────────────────────────────
def _extract_price_and_currency(text: str) -> tuple[Optional[float], str]:
"""
Parse a price string like "$45.00", "CA$1,200", "£700.50",
"45.00 USD", "EUR 120", "Current Bid: $50" etc.
Returns (float_amount, "ISO_CODE") or (None, "").
Handles comma-separated thousands (1,200.00) and
period-as-thousands (1.200,00) formats.
"""
if not text:
return None, ""
# Detect currency from the text
currency = ""
text_upper = text.upper()
for symbol, code in _CURRENCY_MAP:
if symbol.upper() in text_upper:
currency = code
break
# Normalise: remove currency symbols/codes and whitespace
clean = text
for symbol, _ in _CURRENCY_MAP:
clean = clean.replace(symbol, " ")
clean = clean.strip()
# Handle European format: 1.200,50 → 1200.50
if re.search(r"\d\.\d{3},\d{2}", clean):
clean = clean.replace(".", "").replace(",", ".")
# Strip commas used as thousands separators: 1,200.50 → 1200.50
clean = clean.replace(",", "")
# Find the first number (integer or decimal)
m = re.search(r"\d+\.?\d*", clean)
if m:
try:
return float(m.group()), currency
except ValueError:
pass
return None, currency
def _format_price(amount: Optional[float], currency: str) -> str:
"""
Return display string like "50.00 USD" or "700.50 GBP".
If amount is None returns "".
"""
if amount is None:
return ""
cur = currency.strip() if currency else "USD"
return f"{amount:,.2f} {cur}"
# ── N4 — Currency conversion cache ───────────────────────────────────────────
# Module-level FX cache shared by _get_fx_rates() / _convert_price().
_fx_rates: dict[str, float] = {}  # base=USD, e.g. {"GBP": 0.79, "EUR": 0.92}
_fx_fetched_at: float = 0.0       # epoch seconds of the last successful fetch
async def _get_fx_rates() -> dict[str, float]:
    """
    Fetch daily exchange rates from frankfurter.app (free, no key needed).

    Returns {currency_code: rate_per_1_USD}.  The result is cached in the
    module-level `_fx_rates` for 6 hours; on any failure the previously
    cached table (possibly empty) is returned unchanged.
    """
    global _fx_rates, _fx_fetched_at

    SIX_HOURS = 21600
    if _fx_rates and (time.time() - _fx_fetched_at) < SIX_HOURS:
        return _fx_rates

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get("https://api.frankfurter.app/latest?from=USD")
            if resp.status_code != 200:
                print(f"[FX] ⚠️ frankfurter.app returned {resp.status_code} — using cached rates.")
            else:
                table = resp.json().get("rates", {})
                table["USD"] = 1.0  # base currency is always 1:1
                _fx_rates = table
                _fx_fetched_at = time.time()
                print(f"[FX] ✅ Rates updated: {len(table)} currencies.")
    except Exception as exc:
        print(f"[FX] ⚠️ Rate fetch failed: {exc} — using cached rates.")
    return _fx_rates
def _convert_price(price: float, from_currency: str, to_currency: str) -> float | None:
"""
Convert a price from one currency to another using cached FX rates.
Returns None if conversion not possible (unknown currency or no rates loaded).
"""
if not price or not from_currency or not to_currency:
return None
from_c = from_currency.upper()
to_c = to_currency.upper()
if from_c == to_c:
return round(price, 2)
if not _fx_rates:
return None
# Convert: from_c → USD → to_c
rate_from = _fx_rates.get(from_c) # how many USD per 1 from_c unit
rate_to = _fx_rates.get(to_c) # how many USD per 1 to_c unit
if not rate_from or not rate_to:
return None
# price in from_c → price in USD → price in to_c
price_usd = price / rate_from
return round(price_usd * rate_to, 2)
def _extract_time_left(text: str) -> str:
"""
Parse a time-remaining string into a normalised "Xd Yh Zm" format.
Handles inputs like:
"2 days 4 hours 30 minutes"
"4h 30m"
"2d 04:30:00"
"Ends in 3 days"
"1 day left"
"Closing: 02:45" → hours:minutes only
"23:14:05" → HH:MM:SS
"Time Left: 0 days 2 hours 15 minutes"
Returns empty string if nothing parseable found.
"""
if not text:
return ""
t = text.lower()
days = 0
hours = 0
mins = 0
# "X day(s)"
m = re.search(r"(\d+)\s*d(?:ay)?s?", t)
if m:
days = int(m.group(1))
# "X hour(s)" or "Xh"
m = re.search(r"(\d+)\s*h(?:our)?s?", t)
if m:
hours = int(m.group(1))
# "X minute(s)" or "Xm"
m = re.search(r"(\d+)\s*m(?:in(?:ute)?s?)?(?!s)", t)
if m:
mins = int(m.group(1))
# HH:MM:SS or HH:MM (only if no days/hours/mins found yet)
if days == 0 and hours == 0 and mins == 0:
m = re.search(r"(\d{1,2}):(\d{2})(?::(\d{2}))?", t)
if m:
hours = int(m.group(1))
mins = int(m.group(2))
# seconds ignored
if days == 0 and hours == 0 and mins == 0:
return ""
parts = []
if days: parts.append(f"{days}d")
if hours: parts.append(f"{hours}h")
if mins: parts.append(f"{mins}m")
return " ".join(parts) if parts else ""
# ─────────────────────────────────────────────────────────────────────────────
# N2 — CAPTCHA Detection & Solver
# ─────────────────────────────────────────────────────────────────────────────
_CAPTCHA_INDICATORS = [
"captcha", "cf-challenge", "g-recaptcha", "h-captcha",
"datadome", "px-captcha", "challenge-form", "robot check",
"are you human", "verify you are human", "security check",
"just a moment", "checking your browser", # Cloudflare phrases
]
async def _detect_captcha(page) -> bool:
"""Return True if the current page appears to show a CAPTCHA challenge."""
try:
content = (await page.content()).lower()
title = (await page.title()).lower()
for ind in _CAPTCHA_INDICATORS:
if ind in content or ind in title:
return True
except Exception:
pass
return False
async def _solve_captcha_2captcha(page, api_key: str) -> bool:
    """
    Attempt to solve a sitekey-based CAPTCHA (reCAPTCHA/hCaptcha) via 2captcha.
    Injects the g-recaptcha-response token and submits the form.
    Returns True if solved successfully.

    Flow: extract [data-sitekey] from the page → submit to 2captcha's in.php →
    poll res.php every 5 s (24 attempts ≈ 120 s budget) → on success, write
    the token into #g-recaptcha-response and fire the reCAPTCHA callbacks.
    Any exception is caught and logged; the function then returns False.
    """
    try:
        # Extract sitekey from page
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            return el ? el.getAttribute('data-sitekey') : null;
        }""")
        if not sitekey:
            print("[CAPTCHA] ⚠️ No sitekey found — cannot auto-solve.")
            return False
        page_url = page.url
        # Submit task to 2captcha ("userrecaptcha" method; json=1 → JSON reply)
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post("https://2captcha.com/in.php", data={
                "key": api_key, "method": "userrecaptcha",
                "googlekey": sitekey, "pageurl": page_url, "json": 1,
            })
            data = r.json()
        # 2captcha signals success with status == 1
        if data.get("status") != 1:
            print(f"[CAPTCHA] ❌ 2captcha submit failed: {data}")
            return False
        task_id = data["request"]
        # Poll for result (up to 120s: 24 polls × 5 s)
        for attempt in range(24):
            await asyncio.sleep(5)
            async with httpx.AsyncClient(timeout=15) as client:
                r = await client.get("https://2captcha.com/res.php", params={
                    "key": api_key, "action": "get", "id": task_id, "json": 1,
                })
                data = r.json()
            if data.get("status") == 1:
                token = data["request"]
                # Inject token into page and trigger any registered reCAPTCHA
                # client callbacks so the host page proceeds.
                # NOTE(review): the f-prefix here is vestigial — there are no
                # interpolations, only doubled braces for literal JS braces.
                await page.evaluate(f"""(token) => {{
                    document.getElementById('g-recaptcha-response').value = token;
                    if (window.___grecaptcha_cfg) {{
                        const cbs = Object.values(window.___grecaptcha_cfg.clients || {{}});
                        cbs.forEach(c => {{ if (c.l && c.l.l) c.l.l(token); }});
                    }}
                }}""", token)
                await asyncio.sleep(1)
                print("[CAPTCHA] ✅ 2captcha token injected.")
                return True
            # Any reply other than "not ready yet" is a terminal error.
            if data.get("request") != "CAPCHA_NOT_READY":
                print(f"[CAPTCHA] ❌ 2captcha error: {data}")
                return False
        print("[CAPTCHA] ❌ 2captcha timed out after 120s.")
        return False
    except Exception as exc:
        print(f"[CAPTCHA] ❌ Solver error: {exc}")
        return False
async def _solve_captcha_capsolver(page, api_key: str) -> bool:
    """
    CapSolver-flavoured solver — same flow as _solve_captcha_2captcha,
    different endpoint (createTask / getTaskResult, proxy-less task type).

    Flow: extract [data-sitekey] → create a ReCaptchaV2TaskProxyLess task →
    poll getTaskResult every 5 s (24 attempts ≈ 120 s budget) → inject the
    returned token into #g-recaptcha-response.

    Returns True when a token was injected, False on any failure or timeout.
    """
    try:
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            return el ? el.getAttribute('data-sitekey') : null;
        }""")
        if not sitekey:
            # Consistent with the 2captcha solver: log why we bailed.
            print("[CAPTCHA] ⚠️ No sitekey found — cannot auto-solve.")
            return False
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post("https://api.capsolver.com/createTask", json={
                "clientKey": api_key,
                "task": {"type": "ReCaptchaV2TaskProxyLess",
                         "websiteURL": page.url, "websiteKey": sitekey},
            })
            data = r.json()
        # CapSolver signals success with errorId == 0
        if data.get("errorId", 1) != 0:
            print(f"[CAPTCHA] ❌ CapSolver create failed: {data}")
            return False
        task_id = data["taskId"]
        for _ in range(24):
            await asyncio.sleep(5)
            async with httpx.AsyncClient(timeout=15) as client:
                r = await client.post("https://api.capsolver.com/getTaskResult", json={
                    "clientKey": api_key, "taskId": task_id,
                })
                data = r.json()
            if data.get("status") == "ready":
                token = data["solution"]["gRecaptchaResponse"]
                # BUGFIX: the previous code passed a plain JS expression that
                # read `arguments[0]` — Playwright only forwards the extra
                # argument when the script is a function, so the token was
                # never written.  Use a proper arrow function like the
                # 2captcha solver does.
                await page.evaluate(
                    "(token) => { document.getElementById('g-recaptcha-response').value = token; }",
                    token,
                )
                print("[CAPTCHA] ✅ CapSolver token injected.")
                return True
            if data.get("status") == "failed":
                print(f"[CAPTCHA] ❌ CapSolver failed: {data}")
                return False
        # Polling budget exhausted without a terminal status.
        return False
    except Exception as exc:
        print(f"[CAPTCHA] ❌ CapSolver error: {exc}")
        return False
async def _solve_cf_turnstile(page, solver: str, api_key: str) -> bool:
    """
    Solve a Cloudflare Turnstile managed challenge using 2captcha or CapSolver.
    Extracts the Turnstile sitekey from the CF page, submits to solver API,
    injects the returned token into the hidden form field, and submits.
    Returns True if challenge was submitted successfully (CF redirect follows).

    Args:
        page:    Playwright page currently showing the CF challenge.
        solver:  "2captcha" or "capsolver" (any other value never obtains a
                 token and therefore returns False).
        api_key: API key for the chosen solver service.

    Polling budget per provider: 24 polls × 5 s ≈ 120 s.
    """
    try:
        # Sitekey: explicit [data-sitekey] attribute first, otherwise a regex
        # over the inline challenge script on CF's managed-challenge page.
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            if (el) return el.getAttribute('data-sitekey');
            // Fallback: parse from inline script source on CF managed challenge page
            const m = document.body.innerHTML.match(/sitekey[\"\\s:=]+[\"']([0-9a-zA-Z_\\-]{10,})/);
            return m ? m[1] : null;
        }""")
        if not sitekey:
            print("[AutoAdapt-CF] ⚠️ Turnstile sitekey not found — cannot auto-solve")
            return False
        page_url = page.url
        token = None
        if solver == "2captcha":
            # Submit a "turnstile" task, then poll res.php for the token.
            async with httpx.AsyncClient(timeout=30) as client:
                r = await client.post("https://2captcha.com/in.php", data={
                    "key": api_key, "method": "turnstile",
                    "sitekey": sitekey, "pageurl": page_url, "json": 1,
                })
                data = r.json()
            if data.get("status") != 1:
                print(f"[AutoAdapt-CF] ❌ 2captcha Turnstile submit failed: {data}")
                return False
            task_id = data["request"]
            for _ in range(24):
                await asyncio.sleep(5)
                async with httpx.AsyncClient(timeout=15) as client:
                    r = await client.get("https://2captcha.com/res.php", params={
                        "key": api_key, "action": "get", "id": task_id, "json": 1,
                    })
                    data = r.json()
                if data.get("status") == 1:
                    token = data["request"]
                    break
                # Anything other than "not ready yet" is a terminal error.
                if data.get("request") != "CAPCHA_NOT_READY":
                    print(f"[AutoAdapt-CF] ❌ 2captcha Turnstile error: {data}")
                    return False
        elif solver == "capsolver":
            # Same flow via CapSolver's createTask / getTaskResult endpoints.
            async with httpx.AsyncClient(timeout=30) as client:
                r = await client.post("https://api.capsolver.com/createTask", json={
                    "clientKey": api_key,
                    "task": {"type": "AntiTurnstileTaskProxyLess",
                             "websiteURL": page_url, "websiteKey": sitekey},
                })
                data = r.json()
            if data.get("errorId", 1) != 0:
                print(f"[AutoAdapt-CF] ❌ CapSolver Turnstile create failed: {data}")
                return False
            task_id = data["taskId"]
            for _ in range(24):
                await asyncio.sleep(5)
                async with httpx.AsyncClient(timeout=15) as client:
                    r = await client.post("https://api.capsolver.com/getTaskResult", json={
                        "clientKey": api_key, "taskId": task_id,
                    })
                    data = r.json()
                if data.get("status") == "ready":
                    token = data["solution"]["token"]
                    break
                if data.get("status") == "failed":
                    print(f"[AutoAdapt-CF] ❌ CapSolver Turnstile failed: {data}")
                    return False
        if not token:
            print("[AutoAdapt-CF] ❌ Turnstile solver timed out after 120s")
            return False
        # Inject token into the hidden field and submit the CF challenge form
        await page.evaluate("""(token) => {
            const inp = document.querySelector('[name="cf-turnstile-response"]');
            if (inp) inp.value = token;
            const form = document.querySelector('#challenge-form') ||
                         document.querySelector('form[action]');
            if (form) form.submit();
        }""", token)
        print("[AutoAdapt-CF] ✅ Turnstile token injected and challenge form submitted")
        return True
    except Exception as exc:
        print(f"[AutoAdapt-CF] ❌ Turnstile solver error: {exc}")
        return False
async def _handle_captcha(page, site_name: str) -> bool:
    """
    Unified CAPTCHA handler.  Reads the solver preference from Config and
    dispatches to the matching solver implementation.

    Returns True when no CAPTCHA is present or it was solved; False when
    the page is blocked and could not be cleared.
    """
    if not await _detect_captcha(page):
        return True  # no captcha — all clear

    solver = _get_config("captcha_solver", "none").lower()
    api_key = _get_config("captcha_api_key", "")
    print(f"[CAPTCHA] 🔒 CAPTCHA detected on {site_name} — solver={solver}")
    await send_alert(
        f"🔒 <b>CAPTCHA detected</b> on <b>{site_name}</b>\n"
        f"Solver: {solver}. {'Attempting auto-solve…' if solver != 'none' else '⚠️ No solver configured — set one in Settings.'}"
    )
    if solver == "none" or not api_key:
        return False

    # Dispatch table keeps the supported solvers in one place.
    dispatch = {
        "2captcha": _solve_captcha_2captcha,
        "capsolver": _solve_captcha_capsolver,
    }
    handler = dispatch.get(solver)
    if handler is None:
        return False
    return await handler(page, api_key)
# ─────────────────────────────────────────────────────────────────────────────
# N3 — Block / Rate-limit Detection & Site Health Tracking
# ─────────────────────────────────────────────────────────────────────────────
_BLOCK_PHRASES = [
"access denied", "403 forbidden", "you have been blocked",
"your ip", "rate limit", "too many requests", "bot detected",
"automated access", "suspicious activity", "security check required",
"enable javascript", "please enable cookies",
]
async def _detect_block(page) -> bool:
"""Return True if the current page is a block/ban/rate-limit page."""
try:
status_code = getattr(page, "_last_status", None)
if status_code and status_code in (403, 429, 503):
return True
content = (await page.content()).lower()
title = (await page.title()).lower()
for phrase in _BLOCK_PHRASES:
if phrase in content or phrase in title:
return True
except Exception:
pass
return False
def _record_site_success(site_id: int) -> None:
    """Reset error counters and record last_success_at after a successful scrape."""
    session = SessionLocal()
    try:
        site = session.query(TargetSite).filter(TargetSite.id == site_id).first()
        if site is not None:
            # A success clears the failure streak and lifts any cooldown.
            site.consecutive_failures = 0
            site.last_success_at = datetime.now()
            site.cooldown_until = None
            session.commit()
    except Exception as exc:
        print(f"[Health] ⚠️ Could not record success for site {site_id}: {exc}")
    finally:
        session.close()
def _record_site_error(site_id: int, error_msg: str) -> None:
    """
    Increment error counters after a failed scrape of a site.

    If consecutive failures reach the `site_auto_disable_after` threshold the
    site is placed in a 30-minute cooldown and a Telegram alert is scheduled.
    A threshold of 0 disables the cooldown mechanism entirely.

    Fixes vs. previous version:
      * drops the redundant local `from datetime import timedelta`
        (timedelta is already imported at module level);
      * commits the DB changes BEFORE scheduling the alert, and guards
        `asyncio.create_task` — previously a RuntimeError from create_task
        (no running event loop in this thread) was swallowed by the outer
        except and silently skipped the commit, losing the cooldown.
    """
    db = SessionLocal()
    try:
        s = db.query(TargetSite).filter(TargetSite.id == site_id).first()
        if not s:
            return
        s.last_error = error_msg[:500]
        s.error_count = (s.error_count or 0) + 1
        s.consecutive_failures = (s.consecutive_failures or 0) + 1
        threshold = int(_get_config("site_auto_disable_after", "5"))
        alert_msg = None
        if threshold > 0 and s.consecutive_failures >= threshold:
            # Put into 30-minute cooldown
            s.cooldown_until = datetime.now() + timedelta(minutes=30)
            alert_msg = (
                f"⛔ <b>Site blocked/failing: {s.name}</b>\n"
                f"Consecutive failures: {s.consecutive_failures}\n"
                f"Last error: {error_msg[:200]}\n"
                f"Site placed in 30-minute cooldown."
            )
            print(f"[Health] ⛔ {s.name} in cooldown after {s.consecutive_failures} failures.")
        # Persist counters/cooldown first so alert scheduling cannot undo it.
        db.commit()
        if alert_msg:
            try:
                # create_task requires a running event loop in this thread.
                asyncio.create_task(send_alert(alert_msg))
            except RuntimeError as loop_err:
                print(f"[Health] ⚠️ Could not schedule alert: {loop_err}")
    except Exception as exc:
        print(f"[Health] ⚠️ Could not record error for site {site_id}: {exc}")
    finally:
        db.close()
async def _discover_search_input(page, css_fallback: str, site_name: str):
"""
Search Discovery — finds the search input on any page using Playwright's
semantic locators before falling back to the CSS selector stored in the DB.
Probe order (first match wins):
1. get_by_role("textbox", name="search") — ARIA role + accessible name
2. get_by_role("searchbox") — explicit ARIA searchbox role
3. get_by_placeholder("Search …") — common English placeholder variants
4. get_by_label("Search …") — <label> or aria-label variants
5. css_fallback — the site's stored search_selector
Each semantic probe has a 5-second timeout so a missing element fails fast
and the next strategy is tried immediately. The CSS fallback gets the full
15-second wait_for_selector treatment so slow navbars still load.
Returns the located Playwright Locator/ElementHandle on success, or raises
a RuntimeError with a descriptive message if every strategy fails (so the
caller can bail cleanly and log the exact reason).
"""
# ── Strategy table ────────────────────────────────────────────────────────
# Each entry is (label, coroutine_factory) where the factory is called with
# no args and returns an awaitable that resolves to the element or raises.
TIMEOUT_MS = 5_000 # per-strategy timeout for semantic probes
async def try_role_search():
loc = page.get_by_role("textbox", name=re.compile(r"search", re.I))
await loc.wait_for(timeout=TIMEOUT_MS)
return loc
async def try_role_searchbox():
loc = page.get_by_role("searchbox")
await loc.wait_for(timeout=TIMEOUT_MS)
return loc
async def try_placeholder():
# Cover "Search", "Search…", "Search for items", "Search items", etc.
loc = page.get_by_placeholder(re.compile(r"search", re.I))
await loc.wait_for(timeout=TIMEOUT_MS)
return loc
async def try_label():
loc = page.get_by_label(re.compile(r"search", re.I))
await loc.wait_for(timeout=TIMEOUT_MS)
return loc
strategies = [
("semantic:role[textbox+name=search]", try_role_search),
("semantic:role[searchbox]", try_role_searchbox),
("semantic:placeholder[search]", try_placeholder),
("semantic:label[search]", try_label),
]
for strategy_name, factory in strategies:
try:
element = await factory()
print(
f"[Scraper] 🔍 {site_name}: search input located via "
f"{strategy_name}"
)
return element
except Exception:
# Probe failed — try the next strategy silently
continue
# ── CSS fallback ──────────────────────────────────────────────────────────
if css_fallback:
try:
print(
f"[Scraper] 🔍 {site_name}: semantic probes exhausted, "
f"falling back to CSS selector '{css_fallback}'"
)
await page.wait_for_selector(css_fallback, timeout=15_000)
return page.locator(css_fallback)
except Exception as css_exc:
raise RuntimeError(
f"CSS fallback '{css_fallback}' also failed: {css_exc}"
) from css_exc
raise RuntimeError(
"All semantic probes failed and no CSS fallback was provided. "
"Add a search_selector for this site in the Target Sites tab."
)
# ─────────────────────────────────────────────────────────────────────────────
# N14 — Login Session Helper
# ─────────────────────────────────────────────────────────────────────────────
async def _check_login_status(page, site: TargetSite) -> bool:
"""
Returns True if the browser session appears to be logged into this site.
Detection strategy (in order):
1. If site.login_check_selector is set: element present = logged in.
2. If site.login_url is set: navigate to login page, check if we're
redirected away (already logged in) or stay on login page (not logged in).
3. If neither configured: assume logged in (don't block scraping).
"""
if not site.requires_login or not site.login_enabled:
return True # login not required for this site
try:
if site.login_check_selector and site.login_check_selector.strip():
el = await page.query_selector(site.login_check_selector)
logged_in = el is not None
if not logged_in:
print(f"[Login] ⚠️ {site.name}: not logged in (selector absent).")
return logged_in
if site.login_url and site.login_url.strip():
before = page.url
await page.goto(site.login_url, timeout=30_000, wait_until="domcontentloaded")
after = page.url
logged_in = after != site.login_url # redirected away = logged in
if not logged_in:
print(f"[Login] ⚠️ {site.name}: still on login page — not logged in.")
return logged_in
except Exception as exc:
print(f"[Login] ⚠️ Login check failed for {site.name}: {exc}")
return True # can't determine — proceed anyway
# ─────────────────────────────────────────────────────────────────────────────
# N5 — Pagination Helper
# ─────────────────────────────────────────────────────────────────────────────
_NEXT_PAGE_SELS = [
"a[aria-label='Next']",
"a[aria-label='Next page']",
"a[rel='next']",
"a.pagination-next",
"li.next > a",
"button[aria-label='Next page']",
".s-pagination-next", # eBay
"[data-testid='pagination-next']",
"a:has-text('Next')",
"a:has-text('')",
"a:has-text('»')",
]
async def _go_next_page(page) -> bool:
"""
Attempt to click the "Next page" button. Returns True if navigation succeeded.
Tries a list of common selectors across different auction platforms.
"""
for sel in _NEXT_PAGE_SELS:
try:
el = await page.query_selector(sel)
if el:
is_disabled = await el.get_attribute("aria-disabled") or \
await el.get_attribute("disabled") or ""
if "true" in str(is_disabled).lower() or "disabled" in str(is_disabled).lower():
continue
await el.click()
await page.wait_for_load_state("networkidle", timeout=30_000)
return True
except Exception:
continue
return False
# ── Page-level lot-card extractor (used by scrape_site + pagination) ──────────
# Returns a list of {title, price_text, time_text, location, href, images[]}
# Extracted once and reused so the pagination path shares the same logic.
JS_EXTRACT = """() => {
// ─────────────────────────────────────────────────────────────────────────
// CARD-ANCHORED EXTRACTION
// Strategy: find each listing's "card" container first, then look for
// title / price / time / link / images INSIDE that same card.
// For Angular/React sites where price+time live outside the card (e.g. HiBid),
// we walk UP through ancestors until we reach a container that holds all
// four elements together.
// ─────────────────────────────────────────────────────────────────────────
// ── 1. Find card containers ───────────────────────────────────────────────
const CARD_SELS = [
'li.s-item', // eBay
'div.lot-card', // ShopGoodwill
'.item-card',
'article[class*="lot"]',
'article[class*="item"]',
'div[class*="lot-item"]',
'div[class*="lot-card"]',
'div[class*="listing-item"]',
'div[class*="result-item"]',
'[data-listing-id]',
'[data-lot-id]',
'[data-item-id]',
];
let cards = [];
for (const s of CARD_SELS) {
try {
const els = document.querySelectorAll(s);
if (els.length >= 2) { cards = Array.from(els); break; }
} catch(e) {}
}
// ── 2. Helpers ────────────────────────────────────────────────────────────
// Query inside a root, trying selectors in order
const qIn = (root, sels) => {
for (const s of sels) {
try { const el = root.querySelector(s); if (el) return el; }
catch(e) {}
}
return null;
};
// ── Apollo Client cache helper (HiBid & other GraphQL SPAs) ──────────────
// Builds a map of lotId → [imageUrl, ...] from the Apollo in-memory cache.
// Works on both search-results pages and detail pages.
// Returns empty map if Apollo is not present.
const _apolloImgMap = (() => {
const map = {};
try {
if (!window.__APOLLO_CLIENT__) return map;
const cache = window.__APOLLO_CLIENT__.cache.extract();
for (const cacheKey of Object.keys(cache)) {
// Keys like "Lot:289880823" or "Item:12345"
const m = cacheKey.match(/^(?:Lot|Item|AuctionLot|Product):(\\d+)$/);
if (!m) continue;
const id = m[1];
const entry = cache[cacheKey];
const urls = [];
// HiBid: pictures[].fullSizeLocation / hdThumbnailLocation
const pics = entry.pictures || entry.images || entry.photos || [];
if (Array.isArray(pics)) {
for (const p of pics) {
const u = typeof p === 'string' ? p :
(p.fullSizeLocation || p.hdThumbnailLocation ||
p.thumbnailLocation || p.url || p.src || p.imageUrl || '');
if (u && u.startsWith('http')) urls.push(u);
}
}
// Also check featuredPicture
const fp = entry.featuredPicture;
if (fp && typeof fp === 'object') {
const u = fp.fullSizeLocation || fp.hdThumbnailLocation || fp.url || '';
if (u && u.startsWith('http') && !urls.includes(u)) urls.push(u);
}
if (urls.length) map[id] = urls;
}
} catch(e) {}
return map;
})();
// Get images for a given href via Apollo cache (HiBid: /lot/{id}/...)
const apolloImagesForHref = (href) => {
try {
const m = (href || '').match(/\\/(?:lot|item|product)\\/(\\d+)/i);
if (m && _apolloImgMap[m[1]]) return _apolloImgMap[m[1]];
} catch(e) {}
return null;
};
// Get text from element, with label stripping
const priceText = el => {
if (!el) return '';
let t = (el.innerText || el.getAttribute('data-price') ||
el.getAttribute('content') || '').trim();
// Multi-line: take first line with a digit
if (t.includes('\\n')) {
const ln = t.split('\\n').find(x => /[0-9]/.test(x));
if (ln) t = ln.trim();
}
// Strip label prefix: "Current Bid: $45""$45"
t = t.replace(/^[^[0-9]$£€¥₹CA]*(?=[$£€¥₹[0-9]])/, '').trim();
return t;
};
// Extract image URLs from a container element (up to 5, no tiny icons)
// Apollo href override: if a lot href is provided, tries Apollo cache first.
const extractImages = (root, href) => {
// Apollo cache takes priority — full gallery, no DOM scanning needed
const apolloUrls = apolloImagesForHref(href);
if (apolloUrls && apolloUrls.length) return apolloUrls.slice(0, 10);
const urls = [];
if (!root) return urls;
const imgs = root.querySelectorAll('img');
for (const img of Array.from(imgs).slice(0, 10)) {
const src = img.getAttribute('data-src') || img.getAttribute('data-lazy-src') ||
img.getAttribute('data-original') || img.getAttribute('data-lazy') ||
img.src || '';
if (!src || !src.startsWith('http') || src.length < 20) continue;
// Skip tiny icons / trackers (check rendered or attribute dimensions)
const w = img.naturalWidth || parseInt(img.getAttribute('width') || '0') || 0;
const h = img.naturalHeight || parseInt(img.getAttribute('height') || '0') || 0;
if (w > 0 && w < 40) continue;
if (h > 0 && h < 40) continue;
if (!urls.includes(src)) urls.push(src);
if (urls.length >= 5) break;
}
return urls;
};
const TITLE_SELS = [
'.s-item__title',
'h2.lot-title', 'h3.lot-title',
'h2[class*="lot"]', 'h3[class*="lot"]',
'h2[class*="title"]', 'h3[class*="title"]',
'.lot-title', '.item-title', '.listing-title',
'.product-title', '.card-title',
'h2', 'h3',
];
const PRICE_SELS = [
'.s-item__price',
'[class*="current-bid"] [class*="amount"]',
'[class*="current-bid"]',
'[class*="bid-amount"]', '[class*="bid-price"]',
'[class*="currentBid"]',
'[itemprop="price"]',
'[class*="price-value"]', '[class*="price_value"]',
'span[class*="price"]',
'.price', '[class*="price"]',
];
const TIME_SELS = [
'.s-item__time-left', '.s-item__time-end',
'[class*="time-left"]', '[class*="timeleft"]',
'[class*="countdown"]', '[class*="closing-time"]',
'[class*="closingTime"]', '[class*="time-remaining"]',
'[class*="ends-in"]', '[class*="lot-end"]',
'[class*="auction-end"]', '[class*="expire"]',
'[class*="end-time"]', 'time',
];
const LINK_SELS = [
"a[href*='/lot/']", "a[href*='/item/']",
"a[href*='/itm/']", "a[href*='/listing/']",
"a[href*='/product/']", "a[href*='/auction/']", "a",
];
// ── 3. Per-card extraction ─────────────────────────────────────────────────
const rows = [];
if (cards.length >= 2) {
// NORMAL PATH: each card contains all its own data
for (const card of cards.slice(0, 30)) {
const titleEl = qIn(card, TITLE_SELS);
const title = titleEl ? titleEl.innerText.trim() : '';
if (!title || title.length < 4) continue;
const pt = priceText(qIn(card, PRICE_SELS));
const tt = (qIn(card, TIME_SELS) || {innerText:''}).innerText.trim();
// Location
const LOC_SELS = [
'[data-location]','[class*="location"]','[class*="country"]','[class*="city"]',
'.item-location','.lot-location','.seller-location',
'span[itemprop="addressLocality"]','span[itemprop="addressCountry"]',
];
let locEl = qIn(card, LOC_SELS);
let location = locEl ? locEl.textContent.trim().replace(/\\s+/g,' ').slice(0,80) : '';
// Link: card itself → title's closest <a> → first link sel inside card
let href = card.tagName === 'A' ? card.href : '';
if (!href && titleEl) {
const a = titleEl.closest('a') || titleEl.querySelector('a');
if (a) href = a.href;
}
if (!href) {
const a = qIn(card, LINK_SELS);
if (a) href = a.href;
}
// Images — Apollo cache first, then DOM fallback
const images = extractImages(card, href);
rows.push({ title, price_text: pt, time_text: tt, location, href: href || '', images });
}
}
// ── 4. FALLBACK: no card containers found (Angular SPA etc.) ───────────────
// Find all title elements, then for each one walk UP the DOM to
// find the smallest ancestor that also contains a price element.
// That ancestor is the logical "card" for this title.
if (rows.length === 0) {
const titleEls = [];
for (const s of TITLE_SELS) {
try {
const found = document.querySelectorAll(s);
if (found.length >= 2) {
titleEls.push(...Array.from(found));
break;
}
} catch(e) {}
}
for (const titleEl of titleEls.slice(0, 30)) {
const title = titleEl.innerText.trim();
if (!title || title.length < 4) continue;
// Walk up max 8 levels to find a container with price/time
let container = titleEl.parentElement;
let pt = '', tt = '', location = '', href = '';
for (let depth = 0; depth < 8 && container; depth++) {
pt = priceText(qIn(container, PRICE_SELS));
tt = (qIn(container, TIME_SELS) || {innerText:''}).innerText.trim();
if (pt || depth >= 5) break; // found or deep enough
container = container.parentElement;
}
// Location from container
if (container) {
const LOC_SELS = [
'[data-location]','[class*="location"]','[class*="country"]','[class*="city"]',
'.item-location','.lot-location','.seller-location',
'span[itemprop="addressLocality"]','span[itemprop="addressCountry"]',
];
let locEl = qIn(container, LOC_SELS);
location = locEl ? locEl.textContent.trim().replace(/\\s+/g,' ').slice(0,80) : '';
}
// Link from container
if (container) {
const a = container.tagName === 'A' ? container :
titleEl.closest('a') || qIn(container, LINK_SELS);
if (a) href = a.href || '';
}
// Images — Apollo cache first (uses href), then DOM fallback
const images = extractImages(container, href);
rows.push({ title, price_text: pt, time_text: tt, location, href, images });
}
}
return rows;
}"""
# ── JS_DETAIL_IMAGES ─────────────────────────────────────────────────────────
# Runs on a LOT DETAIL page (not search results). 5-layer image extraction,
# tried in priority order — later layers only top up when earlier layers
# found too few URLs:
#   Layer 0 — Apollo Client cache (HiBid / GraphQL SPAs)
#   Layer 1 — JSON-LD structured data
#   Layer 2 — Open Graph meta tags         (only runs if < 2 URLs so far)
#   Layer 3 — DOM <img> elements (all data-* attrs; runs if < 5 URLs so far)
#   Layer 4 — <picture><source> srcset     (only runs if < 5 URLs so far)
# URLs are deduplicated via a Set; known tracking-pixel patterns
# (1x1 / pixel.gif / tracking / beacon) and <img> tags with declared
# width/height under 50px are skipped.
# Returns plain array of image URL strings (up to 10).
JS_DETAIL_IMAGES = r"""() => {
const seen = new Set();
const imgUrls = [];
const addUrl = (src) => {
if (!src || typeof src !== 'string') return false;
src = src.trim();
const lc = src.toLowerCase();
if (!src.startsWith('http') || src.length < 20) return false;
if (lc.includes('1x1') || lc.includes('pixel.gif') ||
lc.includes('tracking') || lc.includes('beacon')) return false;
if (seen.has(src)) return false;
seen.add(src);
imgUrls.push(src);
return true;
};
// Layer 0: Apollo Client cache (HiBid & GraphQL SPAs)
try {
if (window.__APOLLO_CLIENT__) {
const ac = window.__APOLLO_CLIENT__.cache.extract();
const lotIdMatch = location.href.match(/\/(?:lot|item|product)\/(\d+)/i);
const lotId = lotIdMatch ? lotIdMatch[1] : null;
const keys = lotId ? ['Lot:'+lotId, 'Item:'+lotId] :
Object.keys(ac).filter(k => /^(Lot|Item|AuctionLot|Product):/.test(k));
for (const ck of keys) {
const e = ac[ck]; if (!e) continue;
const pics = e.pictures || e.images || e.photos || [];
if (Array.isArray(pics)) {
for (const p of pics) {
const u = typeof p === 'string' ? p :
(p.fullSizeLocation || p.hdThumbnailLocation ||
p.thumbnailLocation || p.url || p.src || '');
addUrl(u);
}
}
const fp = e.featuredPicture;
if (fp && typeof fp === 'object')
addUrl(fp.fullSizeLocation || fp.hdThumbnailLocation || fp.url || '');
}
}
} catch(e) {}
// Layer 1: JSON-LD structured data
try {
for (const s of document.querySelectorAll('script[type="application/ld+json"]')) {
try {
const parsed = JSON.parse(s.textContent || '');
for (const node of (Array.isArray(parsed) ? parsed : [parsed])) {
const imgs = node.image || node.photo || [];
for (const img of (Array.isArray(imgs) ? imgs : [imgs])) {
const u = typeof img === 'string' ? img :
(img.url || img.contentUrl || img['@id'] || '');
addUrl(u);
}
}
} catch(e) {}
}
} catch(e) {}
// Layer 2: Open Graph meta tags
if (imgUrls.length < 2) {
try {
for (const m of document.querySelectorAll(
'meta[property="og:image"], meta[name="og:image"], ' +
'meta[property="og:image:url"], meta[itemprop="image"]'))
addUrl(m.getAttribute('content') || m.getAttribute('href') || '');
} catch(e) {}
}
// Layer 3: DOM img elements
if (imgUrls.length < 5) {
try {
for (const img of document.querySelectorAll('img')) {
if (imgUrls.length >= 10) break;
let picked = '';
for (const attr of img.attributes) {
const v = attr.value || '';
if ((attr.name.startsWith('data-') || attr.name === 'src') &&
v.startsWith('http') && v.length > 20 &&
/\.(jpe?g|png|webp|gif)(\?|$)/i.test(v)) { picked = v; break; }
}
if (!picked) picked = img.src || '';
if (!picked.startsWith('http')) continue;
const w = parseInt(img.getAttribute('width') || '0') || 0;
const h = parseInt(img.getAttribute('height') || '0') || 0;
if ((w > 0 && w < 50) || (h > 0 && h < 50)) continue;
addUrl(picked);
}
} catch(e) {}
}
// Layer 4: <picture><source> srcset
if (imgUrls.length < 5) {
try {
for (const s of document.querySelectorAll('picture source[srcset], source[srcset]')) {
if (imgUrls.length >= 10) break;
const parts = (s.getAttribute('srcset') || '').split(',')
.map(p => p.trim().split(/\s+/)[0]).filter(Boolean);
addUrl(parts[parts.length - 1] || '');
}
} catch(e) {}
}
return imgUrls.slice(0, 10);
}"""
# ── JS_APOLLO_WAIT — Apollo cache readiness check ────────────────────────────
# (Previously mislabelled "JS_DETAIL_WAIT" in this comment — the constant's
# actual name is JS_APOLLO_WAIT.)
# Passed to Playwright's page.wait_for_function() with the lot id as `arg`.
# With a lot id: returns true once the Apollo cache holds a matching
# Lot/Item/AuctionLot/Product entry whose `pictures` array is non-empty.
# Without a lot id: returns true once ANY such typed cache entry exists.
# NOTE: plain (non-raw) Python string — hence the doubled backslash (\\d)
# needed to produce \d in the JS regex literal.
JS_APOLLO_WAIT = """(lotId) => {
try {
if (!window.__APOLLO_CLIENT__) return false;
const c = window.__APOLLO_CLIENT__.cache.extract();
if (lotId) {
for (const prefix of ['Lot','Item','AuctionLot','Product']) {
const entry = c[prefix + ':' + lotId];
if (entry && Array.isArray(entry.pictures) && entry.pictures.length > 0)
return true;
}
return false;
}
return Object.keys(c).some(k => /^(Lot|Item|AuctionLot|Product):\\d/.test(k));
} catch(e) { return false; }
}"""
# ── JS_DETAIL_TEXT ───────────────────────────────────────────────────────────
# Runs on a LOT DETAIL page. Extracts the lot description text using a
# priority-ordered strategy — the first layer that yields a long-enough
# string (> 20 chars, > 40 for the last fallback) wins:
#   1. JSON-LD "description" field (cleanest, structured; also checks @graph)
#   2. Open Graph meta description
#   3. Known auction-site description selectors (common CSS patterns)
#   4. Largest <p> block inside the main content area (fallback)
# Whitespace is collapsed and the result truncated to 1500 chars (enough
# context for the AI re-analysis pass, cheap on tokens).
# Returns a plain string; empty string when nothing usable is found.
JS_DETAIL_TEXT = r"""() => {
const MAX = 1500;
function clean(s) {
return (s || '').replace(/\s+/g, ' ').trim().slice(0, MAX);
}
// 1. JSON-LD
try {
const scripts = document.querySelectorAll('script[type="application/ld+json"]');
for (const s of scripts) {
const d = JSON.parse(s.textContent || '{}');
const desc = d.description || (Array.isArray(d['@graph']) && d['@graph'].find(x => x.description)?.description);
if (desc && desc.length > 20) return clean(desc);
}
} catch(e) {}
// 2. OG meta
const og = document.querySelector('meta[property="og:description"]');
if (og && og.content && og.content.length > 20) return clean(og.content);
// 3. Known selectors (eBay, HiBid, ShopGoodwill, generic)
const sel = [
'[data-testid="x-item-description"] iframe', // eBay (inner frame — skipped gracefully)
'.item-description', '.lot-description', '.description-text',
'#desc_div', '#ItemDescription', '#item-description',
'[itemprop="description"]', '.product-description',
'.auction-description', '.lot-details', '.listing-description',
];
for (const s of sel) {
const el = document.querySelector(s);
if (el && el.innerText && el.innerText.trim().length > 20)
return clean(el.innerText);
}
// 4. Largest text block in main content
const candidates = Array.from(document.querySelectorAll('main p, article p, [role="main"] p, .content p, #content p'));
if (candidates.length) {
const longest = candidates.reduce((a, b) => a.innerText.length > b.innerText.length ? a : b);
if (longest.innerText.trim().length > 40) return clean(longest.innerText);
}
return '';
}"""
async def _fetch_listing_images_batch(page_context, new_links: list, db) -> int:
    """
    Visit each new listing's detail page to extract the full image gallery
    AND lot description text (N18).

    Called immediately after the initial scrape so images and descriptions
    are available right away instead of waiting for the 5-minute price
    refresh.

    Args:
        page_context: Playwright BrowserContext (from page.context).
        new_links:    list of (listing_id, link) tuples for freshly saved rows.
        db:           SQLAlchemy session used to update Listing rows.

    Returns:
        Number of listings actually updated with images and/or description.
    """
    updated = 0
    for listing_id, link in new_links:
        if not link or link.startswith("no-link"):
            continue  # placeholder link — there is no detail page to visit
        # Bind before the try so the except handler can never hit an unbound
        # name (new_page() failing) or double-close the previous iteration's
        # already-closed page.
        dp = None
        try:
            dp = await page_context.new_page()
            # Abort heavy asset downloads — only the DOM / cached JSON matter.
            await dp.route(
                "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
                lambda route: route.abort(),
            )
            await dp.goto(link, timeout=25_000, wait_until="domcontentloaded")
            # Smart wait — poll the Apollo cache on GraphQL SPAs (HiBid etc.)
            _lot_id_m = re.search(r'/(?:lot|item|product)/(\d+)', link, re.IGNORECASE)
            _lot_id = _lot_id_m.group(1) if _lot_id_m else None
            try:
                await dp.wait_for_function(JS_APOLLO_WAIT, arg=_lot_id,
                                           timeout=8000, polling=200)
            except Exception:
                pass  # non-Apollo sites simply time out here — harmless
            await dp.wait_for_timeout(1200)
            img_urls = await dp.evaluate(JS_DETAIL_IMAGES)
            desc_text = await dp.evaluate(JS_DETAIL_TEXT)
            await dp.close()
            dp = None  # closed cleanly — except handler must not close again
            # Save images and/or description — at least one must be present
            # to write, and the Listing row must still exist.
            if img_urls or desc_text:
                listing = db.query(Listing).filter(Listing.id == listing_id).first()
                if listing:
                    if img_urls:
                        listing.images = json.dumps(img_urls[:10])
                    if desc_text:
                        listing.description = desc_text[:1500]
                    # N18: Re-run AI with description for better accuracy.
                    # Only re-analyse listings that passed on title alone
                    # (ai_match=1) and whose keyword has an ai_target —
                    # rejected lots stay rejected.
                    _ai_en = _get_config("ai_filter_enabled", "false").lower() == "true"
                    if _ai_en and listing.ai_match == 1 and listing.keyword:
                        try:
                            _kw = db.query(Keyword).filter(Keyword.term == listing.keyword).first()
                            _tgt = (_kw.ai_target or "").strip() if _kw else ""
                            if _tgt:
                                _match2, _reason2 = await _ai_analyze(listing.title, _tgt, desc_text)
                                listing.ai_match = 1 if _match2 else 0
                                listing.ai_reason = (_reason2 or "")[:200]
                                print(f"[AI+Desc] {'' if _match2 else ''} {listing.title[:40]}{_reason2}")
                        except Exception as _ae:
                            print(f"[AI+Desc] ⚠️ re-analysis failed: {_ae}")
                    db.flush()
                    db.commit()
                    # Count and log only when a row was actually updated —
                    # guarded under `if listing:` so a vanished row cannot
                    # raise AttributeError on listing.title.
                    updated += 1
                    _img_n = len(img_urls) if img_urls else 0
                    _desc_n = len(desc_text) if desc_text else 0
                    print(f"[Detail] 🖼️ {listing.title[:35]}{_img_n} img(s), {_desc_n} desc chars")
        except Exception as exc:
            print(f"[Images] ⚠️ {link[:55]}: {exc}")
            if dp is not None:
                try:
                    await dp.close()
                except Exception:
                    pass
            continue
    return updated
async def scrape_site(
page,
site: TargetSite,
keyword: Keyword,
db: Session,
delay_post_search: int = 0,
delay_page_hold: int = 0,
delay_site_open: int = 0,
is_first_keyword: bool = False,
humanize_level: str = "heavy", # raw | low | medium | heavy
round_item_id: int | None = None,
) -> dict:
"""
Navigate to a target site and scrape results for one keyword.
Navigation mode is determined automatically by whether url_template
contains the literal string '{keyword}':
Mode A — Direct Template ({keyword} present)
The URL is built by substituting {keyword} and navigated to directly.
This is the fast path for sites with stable search-result URLs
(eBay, Amazon, etc.). If a search_selector is also provided it is
used only as a fallback when the direct navigation fails.
Mode B — Homepage Search (no {keyword} in url_template)
The bot navigates to url_template as a landing page, then runs
Search Discovery to locate the search input automatically using
ARIA roles, placeholder text, and label associations before
falling back to search_selector if one is stored.
search_selector is optional — leave it blank and Ghost Node will
auto-discover the search box on any 2026-era website.
Returns:
{
status: "done" | "pending",
new_count: int,
pending_reason: str | None
}
"""
new_count = 0
pending_reason: str | None = None
def _update_round_item(_status: str, _reason: str | None) -> None:
"""Best-effort update of a scrape_round_items row for this keyword attempt."""
if round_item_id is None:
return
try:
ri = db.query(ScrapeRoundItem).filter(ScrapeRoundItem.id == round_item_id).first()
if not ri:
return
ri.status = _status
ri.last_error = _reason
ri.last_attempt_at = datetime.now()
if _status == "pending" and not ri.first_pending_at:
ri.first_pending_at = datetime.now()
db.flush()
db.commit()
except Exception as _e:
print(f"[RoundItems] ⚠️ Could not update round item id={round_item_id}: {_e}")
_new_listing_links = [] # (listing_id, link) — for detail-page image fetch
is_direct_mode = "{keyword}" in site.url_template
# ── N14: Login guard — check BEFORE any navigation ───────────────────────
# If the site requires login and the feature is enabled, verify the
# browser session is authenticated. If not logged in, skip this site
# and Telegram-alert the user to use the 🔑 Login button in the dashboard.
if site.requires_login and site.login_enabled:
already_logged_in = await _check_login_status(page, site)
if not already_logged_in:
msg = (
f"🔑 <b>Login required — {site.name}</b>\n"
f"Ghost Node is not logged into this site.\n"
f"Open the Dashboard → Sites tab and press the 🔑 <b>Login</b> button "
f"next to <i>{site.name}</i>, then log in manually.\n"
f"Scraping this site is paused until you log in."
)
asyncio.create_task(send_alert(msg, subject=f"Login required — {site.name}"))
print(f"[Login] ⛔ {site.name}: not logged in — skipping. Use dashboard 🔑 Login button.")
pending_reason = "login required (not logged in)"
_update_round_item("pending", pending_reason)
return {"status": "pending", "new_count": new_count, "pending_reason": pending_reason}
try:
# ── MODE A: Direct template navigation ───────────────────────────────
if is_direct_mode:
target_url = site.url_template.replace(
"{keyword}", keyword.term.replace(" ", "+")
)
print(
f"[Scraper] MODE=DIRECT {site.name} | "
f"'{keyword.term}'{target_url}"
)
# ── Homepage pre-visit (medium / heavy only) ─────────────────────
# Raw/low jump straight to the results URL.
# Medium/heavy visit the homepage first for a natural referer chain.
if is_first_keyword and humanize_level.strip().lower() in ("medium", "heavy"):
try:
# Derive homepage from the template URL
from urllib.parse import urlparse as _uparse
_parsed = _uparse(site.url_template)
_homepage = f"{_parsed.scheme}://{_parsed.netloc}/"
print(f"[Scraper] 🏠 Pre-visiting homepage: {_homepage}")
await page.goto(_homepage, timeout=30_000,
wait_until="domcontentloaded",
referer="https://www.google.com/")
# Human idle on homepage — as if glancing at the front page
await asyncio.sleep(_jitter(2.5, pct=0.5))
await _human_mouse(page)
await asyncio.sleep(_jitter(1.2, pct=0.4))
await _human_scroll(page, steps=random.randint(1, 3))
await asyncio.sleep(_jitter(1.0, pct=0.4))
except Exception as hp_exc:
print(f"[Scraper] ⚠️ Homepage pre-visit skipped: {hp_exc}")
try:
await page.goto(target_url, timeout=60_000, wait_until="networkidle",
referer=f"https://{_parsed.netloc}/" if is_first_keyword else "https://www.google.com/")
except Exception as goto_exc:
print(
f"[Scraper] ⚠️ networkidle timeout for {site.name}, "
f"retrying with domcontentloaded: {goto_exc}"
)
await page.goto(
target_url, timeout=60_000, wait_until="domcontentloaded"
)
# ── N2: CAPTCHA check after navigation ────────────────────────────
if await _detect_captcha(page):
print(f"[CAPTCHA] 🤖 Detected on {site.name} — attempting solve…")
_solver = _get_config("captcha_solver", "none").strip().lower()
_api_key = _get_config("captcha_api_key", "").strip()
_solved = False
if _solver == "2captcha" and _api_key:
_solved = await _solve_captcha_2captcha(page, _api_key)
elif _solver == "capsolver" and _api_key:
_solved = await _solve_captcha_capsolver(page, _api_key)
if not _solved:
print(f"[CAPTCHA] ❌ Could not solve CAPTCHA on {site.name} — skipping.")
_record_site_error(site.id, "CAPTCHA not solved")
pending_reason = "captcha not solved"
_update_round_item("pending", pending_reason)
return {"status": "pending", "new_count": new_count, "pending_reason": pending_reason}
# ── N3: Block detection after navigation ──────────────────────────
if await _detect_block(page):
print(f"[Block] 🚫 {site.name} appears to be blocking us.")
_record_site_error(site.id, "Block/rate-limit detected")
pending_reason = "block/rate-limit detected"
_update_round_item("pending", pending_reason)
return {"status": "pending", "new_count": new_count, "pending_reason": pending_reason}
# ── Website-launch delay (Mode A, first keyword only) ────────────
# Page has just opened. Delay fires here — before any scraping.
# Subsequent keywords on the same site skip this (is_first_keyword=False).
if is_first_keyword and delay_site_open > 0:
print(
f"[Scraper] 🌐 Website-launch delay: {delay_site_open}s "
f"{site.name} opened, holding before scraping "
f"({keyword.term})"
)
await asyncio.sleep(delay_site_open)
print(f"[Scraper] ✅ Website-launch delay done")
# ── MODE B: Homepage search interaction ──────────────────────────────
else:
# search_selector is optional — _discover_search_input will try
# four semantic strategies (ARIA role, searchbox, placeholder, label)
# before falling back to the CSS selector. An empty selector simply
# means all four semantic strategies run with no CSS safety net;
# if they all fail, _discover_search_input raises RuntimeError and
# the except below logs it clearly.
sel = (site.search_selector or "").strip()
print(
f"[Scraper] MODE=HOMEPAGE {site.name} | "
f"'{keyword.term}' via {site.url_template}"
+ (f" selector='{sel}'" if sel else " (auto-discover search box)")
)
# Step 1 — land on the homepage
await page.goto(
site.url_template, timeout=60_000, wait_until="domcontentloaded"
)
# ── Website-launch delay (Mode B, first keyword only) ────────────
# Homepage has just opened — delay fires here, before the search
# box is touched. Subsequent keywords skip this entirely.
if is_first_keyword and delay_site_open > 0:
print(
f"[Scraper] 🌐 Website-launch delay: {delay_site_open}s "
f"{site.name} homepage opened, holding before search"
)
await asyncio.sleep(delay_site_open)
print(f"[Scraper] ✅ Website-launch delay done")
# Step 2 — Search Discovery: semantic locators → CSS fallback
try:
search_el = await _discover_search_input(page, sel, site.name)
# ── Robust input — works minimised / in background ────────────
# Uses only Playwright Locator methods which are JS-driven
# internally. No OS window focus is ever needed.
#
# Why NOT page.evaluate() + element_handle():
# _discover_search_input returns a Locator. Calling
# locator.element_handle() returns None when the element is
# not uniquely resolved at that instant (e.g. during a
# re-navigation) — passing None into page.evaluate() means
# `el` is null inside the JS, so el.focus() throws
# "TypeError: el.focus is not a function".
#
# Locator.focus() / .fill() / .press() resolve the element
# fresh on every call, retry automatically on transient
# detachment, and inject their actions via CDP (Chrome
# DevTools Protocol) — no OS keyboard or mouse events.
# 1. Focus via Locator — CDP-driven, no OS focus needed
await search_el.focus()
await asyncio.sleep(_jitter(0.4, pct=0.4))
# 2. Type search term — mode depends on humanize_level
await search_el.fill("") # clear first
_hlvl_type = humanize_level.strip().lower()
if _hlvl_type == "raw":
# Raw: instant fill — no timing simulation at all
await search_el.fill(keyword.term)
elif _hlvl_type == "low":
# Low: fill in one shot, small pre/post pause
await asyncio.sleep(random.uniform(0.1, 0.3))
await search_el.fill(keyword.term)
await asyncio.sleep(random.uniform(0.1, 0.3))
elif _hlvl_type == "medium":
# Medium: char-by-char typing, variable WPM, no typos
await asyncio.sleep(_jitter(0.3, pct=0.4))
for char in keyword.term:
await search_el.press(char)
if char == " ":
await asyncio.sleep(random.uniform(0.10, 0.25))
else:
await asyncio.sleep(random.uniform(0.05, 0.10))
await asyncio.sleep(_jitter(0.4, pct=0.4))
else: # heavy
# Heavy: char-by-char, variable WPM, 12% typo+backspace,
# word boundary rhythm, pre-submit re-read pause
await asyncio.sleep(_jitter(0.5, pct=0.5))
typo_chars = "qwertyuiopasdfghjklzxcvbnm"
for char in keyword.term:
if random.random() < 0.12 and len(keyword.term) > 3:
wrong = random.choice(typo_chars)
await search_el.press(wrong)
await asyncio.sleep(random.uniform(0.08, 0.18))
await search_el.press("Backspace")
await asyncio.sleep(random.uniform(0.05, 0.15))
await search_el.press(char)
if char == " ":
await asyncio.sleep(random.uniform(0.12, 0.35))
else:
await asyncio.sleep(random.uniform(0.045, 0.110))
# Pre-submit pause — user re-reads what they typed
await asyncio.sleep(_jitter(0.6, pct=0.5))
# 3. Dispatch an explicit 'input' event as belt-and-braces
await search_el.dispatch_event("input")
# 4. Locator.press() sends Enter via CDP
await search_el.press("Enter")
# Step 3 — wait for results page to settle
await page.wait_for_load_state("networkidle", timeout=60_000)
except Exception as sel_exc:
print(
f"[Scraper] ❌ {site.name}: Search Discovery failed — "
f"{sel_exc}"
)
return 0 # bail — don't scrape the homepage itself
# ── N17: Try AI-generated selectors first ────────────────────────────
# If this site has been auto-adapted (SiteSelectors row with confidence
# >= 50 and not marked stale), use those precise selectors to extract
# directly. On success, jump straight to the hold/loop logic.
# On failure (0 results), mark stale and fall through to JS_EXTRACT.
_ai_adapted_rows: list[dict] = []
_ss_row = None
try:
_ss_db = SessionLocal()
_ss_row = _ss_db.query(SiteSelectors).filter(
SiteSelectors.site_id == site.id,
SiteSelectors.confidence >= 50,
SiteSelectors.stale == False, # noqa: E712
).first()
_ss_db.close()
except Exception:
_ss_row = None
if _ss_row:
print(f"[AutoAdapt] ⚡ {site.name}: using AI selectors "
f"(conf={_ss_row.confidence}, container='{_ss_row.container_sel}')")
_ai_adapted_rows = await _extract_with_selectors(page, _ss_row)
if _ai_adapted_rows:
print(f"[AutoAdapt] ✅ {site.name}: {len(_ai_adapted_rows)} rows via AI selectors")
# Update last_tested_at
try:
_upd_db = SessionLocal()
_upd_ss = _upd_db.query(SiteSelectors).filter(SiteSelectors.site_id == site.id).first()
if _upd_ss:
_upd_ss.last_tested_at = datetime.now()
_upd_db.flush()
_upd_db.commit()
_upd_db.close()
except Exception:
pass
else:
# AI selectors returned nothing — mark stale, fall through to JS_EXTRACT
print(f"[AutoAdapt] ⚠️ {site.name}: AI selectors returned 0 rows — marking stale")
try:
_stale_db = SessionLocal()
_stale_ss = _stale_db.query(SiteSelectors).filter(SiteSelectors.site_id == site.id).first()
if _stale_ss:
_stale_ss.stale = True
_stale_db.flush()
_stale_db.commit()
_stale_db.close()
except Exception:
pass
# Auto-heal: if Auto-Adapter is enabled, queue a re-adapt in background
if _get_config("auto_adapt_enabled", "false").lower() == "true":
print(f"[AutoAdapt] 🔄 Queuing background re-adapt for {site.name}")
asyncio.create_task(adapt_site_now(site.id))
# ── Collect listing elements ─────────────────────────────────────────
# These selectors are tried in order; the first one that returns
# results wins. ShopGoodwill items match div.lot-card / .item-card.
listing_selectors = [
"li.s-item", # eBay
".item-cell",
"article.product-pod",
"div.lot-card", # ShopGoodwill
".item-card", # ShopGoodwill alternate
"div.listing-item",
"[data-listing-id]",
"div[class*='result']",
"li[class*='product']",
]
items = []
for sel_try in listing_selectors:
items = await page.query_selector_all(sel_try)
if items:
print(
f"[Scraper] {site.name}: matched {len(items)} items "
f"via '{sel_try}'"
)
break
if not items:
# Last-resort: any anchor whose href looks like a product page
items = await page.query_selector_all(
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
"a[href*='/product/'], a[href*='/lot/']"
)
if items:
print(
f"[Scraper] {site.name}: fallback anchor match "
f"({len(items)} items)"
)
if not items:
print(f"[Scraper] ⚠️ {site.name}: no listing elements found on page.")
# ── Delay 2: post-search pause before scraping ────────────────────
if delay_post_search > 0:
print(
f"[Scraper] ⏳ Post-search delay: {delay_post_search}s "
f"— waiting {delay_post_search}s before scraping "
f"({site.name} | '{keyword.term}')"
)
await asyncio.sleep(delay_post_search)
print(f"[Scraper] ✅ Post-search delay done — starting scrape")
# ── Human simulation — level-gated ──────────────────────────────────
# raw: no simulation at all — bare requests, fastest, least safe
# low: one quick mouse move + one scroll pass
# medium: mouse + scroll + post-scroll idle
# heavy: full 5-step sequence with long idles and re-read behaviour
_hlvl = humanize_level.strip().lower()
if _hlvl == "raw":
pass # no simulation whatsoever
elif _hlvl == "low":
await _human_mouse(page)
await asyncio.sleep(_jitter(0.4, pct=0.4))
await _human_scroll(page, steps=random.randint(1, 2))
await asyncio.sleep(_jitter(0.4, pct=0.4))
elif _hlvl == "medium":
await asyncio.sleep(_jitter(0.7, pct=0.4)) # brief page-load idle
await _human_mouse(page)
await asyncio.sleep(_jitter(0.5, pct=0.4))
await _human_scroll(page, steps=random.randint(2, 4))
await asyncio.sleep(_jitter(0.8, pct=0.4)) # post-scroll idle
else: # heavy (default)
# Full 5-step sequence:
# 1. Brief idle — page just loaded, user orients themselves
# 2. Mouse moves toward content area
# 3. Scroll through results with read-rhythm pauses
# 4. More hover-reading
# 5. "Thinking" pause before acting
await asyncio.sleep(_jitter(1.2, pct=0.5)) # page-load idle
await _human_mouse(page) # initial cursor
await asyncio.sleep(_jitter(0.8, pct=0.5))
await _human_scroll(page, steps=random.randint(4, 7)) # read results
await asyncio.sleep(_jitter(0.9, pct=0.5))
await _human_mouse(page) # hover-reading
await asyncio.sleep(_jitter(1.5, pct=0.6)) # thinking pause
# ── Delay 3: page-hold re-scrape loop ───────────────────────────────
# Holds the results page for exactly `delay_page_hold` seconds total.
# The scraper runs a full pass, and as soon as that pass completes it
# checks if time remains — if yes it immediately starts the next pass
# with no idle wait between passes. The loop only ends when the hold
# timer has fully expired OR on the first pass when no hold is set.
# The DB unique-link constraint deduplicates across all passes —
# each listing URL is written exactly once, no clones ever saved.
_hold_deadline = time.time() + (delay_page_hold if delay_page_hold > 0 else 0)
_pass_num = 0
while True:
_pass_num += 1
_pass_new = 0
_pass_start = time.time()
# Always re-query the DOM on every pass so any items that loaded
# after the initial page-settle are captured.
items_current = []
for sel_try in listing_selectors:
items_current = await page.query_selector_all(sel_try)
if items_current:
break
if not items_current:
items_current = await page.query_selector_all(
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
"a[href*='/product/'], a[href*='/lot/']"
)
items_current = items_current[:30]
# Log every pass — show remaining hold time
if delay_page_hold > 0:
_remaining = max(0, int(_hold_deadline - time.time()))
print(
f"[Scraper] 🔁 Page-hold pass #{_pass_num}"
f"{_remaining}s / {delay_page_hold}s remaining "
f"({site.name} | '{keyword.term}')"
)
# ── Page-level parallel extraction ────────────────────────────────
# If AI selectors produced rows, use them directly (skip JS_EXTRACT).
# Otherwise run the universal JS extractor as before.
if _ai_adapted_rows:
page_data = _ai_adapted_rows
else:
page_data = await page.evaluate(JS_EXTRACT)
from urllib.parse import urljoin
for row in (page_data or []):
try:
title = row.get("title", "").strip()
price_text = row.get("price_text", "").strip()
time_text = row.get("time_text", "").strip()
location = row.get("location", "").strip()
href = row.get("href", "").strip()
images_list = row.get("images", [])
if not title or len(title) < 5:
continue
if href and not href.startswith("http"):
href = urljoin(page.url, href)
score = calculate_attribute_score(title, keyword.weight)
# ── Extract price and check N7 price filters ───────────────────
amount, currency = _extract_price_and_currency(price_text)
# N7: per-keyword price filter
kw_min = keyword.min_price
kw_max = keyword.max_price
if amount is not None:
if kw_min is not None and amount < kw_min:
print(f"[N7] ⬇️ Skipping '{title[:40]}' — price {amount} below min {kw_min}")
continue
if kw_max is not None and amount > kw_max:
print(f"[N7] ⬆️ Skipping '{title[:40]}' — price {amount} above max {kw_max}")
continue
# ── N16: AI filter — runs when keyword has an ai_target ───
_ai_enabled = _get_config("ai_filter_enabled", "false").lower() == "true"
_scoring_on = _get_config("scoring_enabled", "true").lower() == "true"
_ai_target = (keyword.ai_target or "").strip()
_ai_match_val = None
_ai_reason_val = None
if _ai_enabled and _ai_target:
# AI is the judge — score is calculated for display only.
# Score gate is bypassed regardless of scoring_enabled setting.
_ai_match_val, _ai_reason_val = await _ai_analyze(title, _ai_target)
if not _ai_match_val:
# AI rejected — save to DB as rejected (ai_match=0) but don't alert
_stats["total_scanned"] += 1
if not (href and db.query(Listing).filter(Listing.link == href).first()):
amount_rej, currency_rej = _extract_price_and_currency(price_text)
listing_rej = Listing(
title=title[:500],
price=amount_rej,
currency=currency_rej[:10] if currency_rej else "",
price_raw=_format_price(amount_rej, currency_rej)[:100],
time_left=_extract_time_left(time_text)[:60],
link=href or f"no-link-{random.randint(0,999999)}",
score=score,
keyword=keyword.term,
site_name=site.name,
location=location or "",
ai_match=0,
ai_reason=_ai_reason_val[:200] if _ai_reason_val else None,
images=json.dumps(images_list[:10]) if images_list else None,
)
db.add(listing_rej)
db.flush()
db.commit()
continue
elif _scoring_on:
# No AI target — fall back to score gate (only when scoring is enabled)
if score < 0:
continue
# else: scoring disabled AND no AI target → all lots pass through
_stats["total_scanned"] += 1
if href and db.query(Listing).filter(Listing.link == href).first():
continue
# ── N11: Cross-site deduplication (eBay only) ─────────────────
# If the same title already exists on a different eBay region
# within the last 24h, suppress the duplicate listing + alert.
_is_ebay = "ebay" in site.name.lower() or "ebay" in (site.url_template or "").lower()
if _is_ebay:
_cutoff = datetime.now() - timedelta(hours=24)
_recent_other_ebay = db.query(Listing).filter(
Listing.timestamp >= _cutoff,
Listing.site_name != site.name,
Listing.site_name.ilike("%ebay%"),
).all()
_is_cross_dupe = any(
difflib.SequenceMatcher(None, title.lower(), r.title.lower()).ratio() > 0.85
for r in _recent_other_ebay
)
if _is_cross_dupe:
print(f"[N11] 🔁 Cross-site duplicate suppressed: '{title[:50]}'")
continue
price_display = _format_price(amount, currency)
time_left_str = _extract_time_left(time_text)
listing = Listing(
title=title[:500],
price=amount,
currency=currency[:10] if currency else "",
price_raw=price_display[:100],
time_left=time_left_str[:60],
time_left_mins=round(timeLeftToMins(time_left_str), 4) if time_left_str and timeLeftToMins(time_left_str) != float('inf') else None,
price_updated_at=datetime.now(),
link=href or f"no-link-{random.randint(0,999999)}",
score=score,
keyword=keyword.term,
site_name=site.name,
location=location or "",
ai_match=1 if (_ai_enabled and _ai_target) else None,
ai_reason=_ai_reason_val[:200] if _ai_reason_val else None,
images=json.dumps(images_list[:10]) if images_list else None,
)
db.add(listing)
db.flush()
db.commit()
# N4: store USD price for cross-site sorting
if listing.price and listing.currency:
listing.price_usd = _convert_price(listing.price, listing.currency, "USD")
if listing.price_usd:
db.flush()
db.commit()
new_count += 1
_pass_new += 1
_stats["total_alerts"] += 1
_new_listing_links.append((listing.id, listing.link))
_redis_publish("new_listing", {
"id": listing.id, "title": title[:80],
"price": price_display, "site": site.name,
"keyword": keyword.term, "score": score,
})
# ── Alert (with AI verdict if applicable) ─────────────────
_ai_line = f"🤖 AI: ✅ {_ai_reason_val}\n" if (_ai_enabled and _ai_target and _ai_reason_val) else ""
alert = (
f"🎯 <b>Ghost Node — New Hit</b>\n"
f"📦 {title[:80]}\n"
f"💰 {price_display or 'Price unknown'}"
+ (f" | ⏳ {time_left_str}" if time_left_str else "") + "\n"
+ f"🏷️ Keyword: <i>{keyword.term}</i> | Score: {score}\n"
+ _ai_line
+ f"🌐 Site: {site.name}\n"
f"🔗 {href[:200]}"
)
asyncio.create_task(send_alert(alert, subject=f"Ghost Node — {title[:40]}"))
except Exception as row_exc:
print(f"[Scraper] row parse error: {row_exc}")
continue
# Always log pass summary
_pass_elapsed = round(time.time() - _pass_start, 1)
_remaining_after = max(0, int(_hold_deadline - time.time()))
print(
f"[Scraper] ✓ Pass #{_pass_num} complete in {_pass_elapsed}s — "
f"{_pass_new} new | {new_count} total"
+ (f" | {_remaining_after}s hold remaining — re-scraping now" if delay_page_hold > 0 and _remaining_after > 0 else "")
)
# Exit: no hold timer set, or hold timer has now expired
if delay_page_hold <= 0 or time.time() >= _hold_deadline:
break
# Time remains — start next pass immediately (no idle wait)
# ── N5: Pagination — follow "Next page" up to site.max_pages ─────────
# Runs AFTER the page-hold loop so we only paginate when all passes
# on the current page are complete. Each new page resets the hold timer.
_max_pg = max(1, site.max_pages or 1)
_cur_pg = 1
while _cur_pg < _max_pg:
_went = await _go_next_page(page)
if not _went:
break
_cur_pg += 1
print(f"[Scraper] 📄 {site.name} | '{keyword.term}' → page {_cur_pg}/{_max_pg}")
# Brief human pause between pages
await asyncio.sleep(_jitter(2.5, pct=0.4))
if _hlvl not in ("raw",):
await _human_scroll(page, steps=random.randint(2, 4))
# Re-extract listing items on the new page
items_pg = []
for sel_try in listing_selectors:
items_pg = await page.query_selector_all(sel_try)
if items_pg:
break
if not items_pg:
items_pg = await page.query_selector_all(
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
"a[href*='/product/'], a[href*='/lot/']"
)
# Run the same page-level extraction on the new page
try:
rows_pg = await page.evaluate(JS_EXTRACT)
except Exception:
rows_pg = []
for row in (rows_pg or [])[:50]:
try:
title = (row.get("title") or "").strip()
price_text = (row.get("price_text") or "").strip()
time_text = (row.get("time_text") or "").strip()
href = (row.get("href") or "").strip()
images_list2 = row.get("images", [])
if not title or len(title) < 5:
continue
if href and not href.startswith("http"):
from urllib.parse import urljoin
href = urljoin(page.url, href)
score = calculate_attribute_score(title, keyword.weight)
# ── N16: AI filter (pagination) ───────────────────────────
_ai_en2 = _get_config("ai_filter_enabled", "false").lower() == "true"
_ai_tgt2 = (keyword.ai_target or "").strip()
_ai_match2 = None
_ai_reason2 = None
if _ai_en2 and _ai_tgt2:
_ai_match2, _ai_reason2 = await _ai_analyze(title, _ai_tgt2)
if not _ai_match2:
_stats["total_scanned"] += 1
if not (href and db.query(Listing).filter(Listing.link == href).first()):
_amt_r, _cur_r = _extract_price_and_currency(price_text)
db.add(Listing(
title=title[:500], price=_amt_r,
currency=_cur_r[:10] if _cur_r else "",
price_raw=_format_price(_amt_r, _cur_r)[:100],
time_left=_extract_time_left(time_text)[:60],
link=href or f"no-link-pg{_cur_pg}-{random.randint(0,999999)}",
score=score, keyword=keyword.term, site_name=site.name,
ai_match=0,
ai_reason=_ai_reason2[:200] if _ai_reason2 else None,
images=json.dumps(images_list2[:10]) if images_list2 else None,
))
db.flush()
db.commit()
continue
else:
if score < 0:
continue
_stats["total_scanned"] += 1
if href and db.query(Listing).filter(Listing.link == href).first():
continue
amount, currency = _extract_price_and_currency(price_text)
price_display = _format_price(amount, currency)
time_left_str = _extract_time_left(time_text)
listing = Listing(
title=title[:500], price=amount,
currency=currency[:10] if currency else "",
price_raw=price_display[:100],
time_left=time_left_str[:60],
time_left_mins=round(timeLeftToMins(time_left_str), 4) if time_left_str and timeLeftToMins(time_left_str) != float("inf") else None,
price_updated_at=datetime.now(),
link=href or f"no-link-pg{_cur_pg}-{random.randint(0,999999)}",
score=score, keyword=keyword.term, site_name=site.name,
ai_match=1 if (_ai_en2 and _ai_tgt2) else None,
ai_reason=_ai_reason2[:200] if _ai_reason2 else None,
images=json.dumps(images_list2[:10]) if images_list2 else None,
)
db.add(listing)
db.flush()
db.commit()
new_count += 1
_stats["total_alerts"] += 1
_new_listing_links.append((listing.id, listing.link))
_tl_pg = f" | ⏳ {time_left_str}" if time_left_str else ""
_ai_line2 = f"🤖 AI: ✅ {_ai_reason2}\n" if (_ai_en2 and _ai_tgt2 and _ai_reason2) else ""
alert = (
"🎯 <b>Ghost Node — New Hit</b>"
f" (p{_cur_pg})\n"
f"📦 {title[:80]}\n"
f"💰 {price_display or 'Price unknown'}{_tl_pg}\n"
f"🏷️ Keyword: <i>{keyword.term}</i> | Score: {score}\n"
+ _ai_line2
+ f"🌐 Site: {site.name}\n"
f"🔗 {href[:200]}"
)
asyncio.create_task(send_alert(alert, subject=f"Ghost Node — {title[:40]}"))
except Exception:
continue
print(f"[Scraper] ✓ {site.name} | '{keyword.term}' page {_cur_pg}{new_count} total new")
except Exception as nav_exc:
# Baghdad Optimization — single site failure never crashes the engine
print(f"[Scraper] ⚠️ {site.name} | {keyword.term}{nav_exc}")
_record_site_error(site.id, str(nav_exc)[:400])
pending_reason = f"navigation failed: {str(nav_exc)[:200]}"
_update_round_item("pending", pending_reason)
return {"status": "pending", "new_count": new_count, "pending_reason": pending_reason}
# ── Immediate detail-page image fetch for new listings ──────────────────
# Search results pages only have thumbnails. Visit each new lot's detail
# page NOW (same browser context) to grab all images immediately instead
# of waiting for the 5-minute price refresh pass.
if _new_listing_links:
print(f"[Images] 🖼️ Fetching detail images for {len(_new_listing_links)} new listing(s)…")
try:
_img_updated = await _fetch_listing_images_batch(
page.context, _new_listing_links, db
)
print(f"[Images] ✅ {_img_updated}/{len(_new_listing_links)} listings got full images")
except Exception as img_exc:
print(f"[Images] ⚠️ Batch image fetch failed: {img_exc}")
# ── N13: Record success after clean completion ─────────────────────────────
_record_site_success(site.id)
_update_round_item("done", None)
return {"status": "done", "new_count": new_count, "pending_reason": None}
# ─────────────────────────────────────────────────────────────────────────────
# N9 — Closing-Soon Alert Loop (Thread E)
# ─────────────────────────────────────────────────────────────────────────────
async def closing_alert_loop() -> None:
    """
    Multi-interval closing alert loop (Thread E).

    Every 60 seconds, scans saved listings whose auction end time is
    approaching and fires one alert per configured threshold per lot.

    Config keys (re-read fresh on every pass via _get_config):
        closing_alert_enabled  — "true"/"false" master switch
        closing_alert_schedule — comma-separated minute thresholds,
                                 e.g. "60,30,10,5"; a bare "0" disables all
                                 closing alerts while keeping capture alerts.

    State: fired thresholds are persisted per-lot in
    Listing.closing_alerts_sent as a JSON list, so process restarts never
    re-send an alert that already went out.
    """
    print("[Thread E] Closing-alert loop online.")
    while True:
        try:
            enabled = _get_config("closing_alert_enabled", "false").lower() == "true"
            if enabled:
                schedule_raw = _get_config("closing_alert_schedule", "30").strip()
                # Parse thresholds — "0" means no closing alerts at all.
                # Malformed entries are silently skipped.
                thresholds: list[float] = []
                for t in schedule_raw.split(","):
                    t = t.strip()
                    if t and t != "0":
                        try:
                            thresholds.append(float(t))
                        except ValueError:
                            pass
                thresholds = sorted(set(thresholds), reverse=True)  # e.g. [60, 30, 10, 5]
                if thresholds:
                    db = SessionLocal()
                    try:
                        from datetime import timedelta  # NOTE: shadows the module-level import; harmless
                        # Only consider lots first seen within the last 7 days.
                        stale_cutoff = datetime.now() - timedelta(days=7)
                        max_threshold = max(thresholds)  # NOTE(review): computed but never used below
                        candidates = (
                            db.query(Listing)
                            .filter(
                                Listing.time_left_mins != None,
                                Listing.time_left_mins > 0,
                                Listing.timestamp >= stale_cutoff,
                            )
                            .all()
                        )
                        for lot in candidates:
                            if not lot.timestamp:
                                continue
                            # Remaining minutes = stored time-left minus minutes elapsed
                            # since the last price refresh (or initial capture).
                            ref_time = lot.price_updated_at or lot.timestamp
                            elapsed_mins = (datetime.now() - ref_time).total_seconds() / 60.0
                            remaining = (lot.time_left_mins or 0) - elapsed_mins
                            if remaining <= 0:
                                # Auction already over — mark all thresholds as sent
                                # so later passes skip this lot entirely.
                                lot.closing_alerts_sent = json.dumps(thresholds)
                                db.flush()
                                db.commit()
                                continue
                            # Load which intervals have already fired for this lot.
                            try:
                                fired: list = json.loads(lot.closing_alerts_sent or "[]")
                            except Exception:
                                fired = []
                            # NOTE(review): a lot first seen with very little time left
                            # satisfies every threshold >= remaining at once, so several
                            # alerts can fire in a single pass — confirm this is intended.
                            for threshold in thresholds:
                                if threshold in fired:
                                    continue  # already sent for this interval
                                if remaining <= threshold:
                                    mins_int = int(remaining)
                                    secs_int = int((remaining - mins_int) * 60)
                                    # "Mm SSs" under an hour, otherwise "Hh Mm".
                                    time_str = (
                                        f"{mins_int}m {secs_int:02d}s"
                                        if mins_int < 60
                                        else f"{int(remaining/60)}h {mins_int % 60}m"
                                    )
                                    alert = (
                                        f"⏰ <b>CLOSING SOON — {time_str} left!</b>\n"
                                        f"📦 {lot.title[:80]}\n"
                                        f"💰 {lot.price_raw or 'Price unknown'}\n"
                                        f"🏷️ Keyword: <i>{lot.keyword}</i> | Score: {lot.score}\n"
                                        f"🌐 Site: {lot.site_name}\n"
                                        f"🔗 {lot.link[:200]}"
                                    )
                                    # Fire-and-forget so a slow notification never blocks the scan.
                                    asyncio.create_task(send_alert(alert, subject=f"CLOSING SOON — {lot.title[:40]}"))
                                    fired.append(threshold)
                                    # Persist immediately after each send so a crash
                                    # mid-pass cannot cause a duplicate alert.
                                    lot.closing_alerts_sent = json.dumps(fired)
                                    db.flush()
                                    db.commit()
                                    print(f"[Thread E] ⏰ @{threshold}min alert: {lot.title[:50]} ({time_str} left)")
                    finally:
                        db.close()
        except Exception as exc:
            # Broad catch by design: Thread E must survive any DB/parse error
            # and simply retry on the next 60-second tick.
            print(f"[Thread E] Error: {exc}")
        await asyncio.sleep(60)
def run_closing_alert_thread() -> None:
    """Thread E entry point — give the closing-alert coroutine its own event loop.

    Creates a dedicated event loop for this thread, installs it as the thread's
    current loop, and drives closing_alert_loop() on it. Under normal operation
    the coroutine never returns, so this call blocks for the thread's lifetime.
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(closing_alert_loop())
async def nuclear_engine() -> None:
    """
    Main scraper loop (Thread B) — runs forever.

    Pulls a FRESH copy of TargetSites + Config from the DB at the TOP of
    every cycle, so any site/keyword added via the UI is immediately active.

    Per cycle:
      1. Honour pause state and the optional scrape-hours window (N8).
      2. Re-read keywords, enabled sites, delays, humanize level and
         parallelism settings from the DB.
      3. Optionally build/refresh a ScrapeRound of (site × keyword) items
         for batched retry bookkeeping (keyword_batch_enabled).
      4. Scrape each enabled site — one browser per site, one page (tab)
         per keyword — sequentially or in parallel per config.
      5. Close out round bookkeeping, publish stats, then sleep with ±25%
         jitter (shortened to boost_interval_mins when lots close soon),
         waking early if _cycle_now is set by an API write.
    """
    print("[Thread B] Nuclear engine igniting…")
    _stats["engine_status"] = "Running"
    async with async_playwright() as pw:
        while True:
            # Operator pause: poll every 10s until unpaused.
            if _stats["engine_status"] == "Paused":
                await asyncio.sleep(10)
                continue
            # ── N8: Scrape-window check — skip cycle outside allowed hours ────
            _win_enabled = _get_config("scrape_window_enabled", "false").lower() == "true"
            if _win_enabled:
                _now_hour = datetime.now().hour
                _start_h = int(_get_config("scrape_start_hour", "8"))
                _end_h = int(_get_config("scrape_end_hour", "22"))
                # Handles same-day windows (08:00–22:00) and overnight windows (22:00–06:00)
                if _start_h <= _end_h:
                    _in_window = _start_h <= _now_hour < _end_h
                else:  # overnight window e.g. 22→06
                    _in_window = _now_hour >= _start_h or _now_hour < _end_h
                if not _in_window:
                    print(
                        f"[Thread B] 🌙 Outside scrape window ({_start_h:02d}:00{_end_h:02d}:00) "
                        f"— current hour {_now_hour:02d}:xx. Sleeping 5min."
                    )
                    await asyncio.sleep(300)
                    continue
            # ── Pull live config from DB — fresh session every cycle ──────────
            db = SessionLocal()
            try:
                keywords = db.query(Keyword).order_by(Keyword.sort_order.asc(), Keyword.id.asc()).all()
                # Only rows where enabled is explicitly 1.
                # We also audit and log any disabled rows so the operator
                # can confirm that toggling a site off actually prevents
                # the engine from touching it.
                target_sites = db.query(TargetSite).filter(TargetSite.enabled == 1).all()
                disabled_sites = db.query(TargetSite).filter(TargetSite.enabled != 1).all()
                # Inter-cycle sleep (seconds) plus the four human-mimicry delays.
                timer_val = int(_get_config("timer", "120"))
                delay_launch = int(_get_config("delay_launch", "0"))
                delay_site_open = int(_get_config("delay_site_open", "0"))
                delay_search = int(_get_config("delay_post_search", "0"))
                delay_hold = int(_get_config("delay_page_hold", "0"))
                humanize_level = _get_config("humanize_level", "heavy").strip().lower()
                if humanize_level not in ("raw", "low", "medium", "heavy"):
                    humanize_level = "heavy"  # unknown value → safest (most human) mode
                print(f"[Thread B] 🎭 Humanize level: {humanize_level.upper()}")
                # ── Parallel execution settings ────────────────────────────────
                max_concurrent_browsers = max(1, int(_get_config("max_concurrent_browsers", "1")))
                max_tabs_per_site = max(1, int(_get_config("max_tabs_per_site", "1")))
                keyword_batch_enabled = _get_config("keyword_batch_enabled", "false").lower() == "true"
            finally:
                db.close()
            # N4: Refresh FX rates once per cycle
            await _get_fx_rates()
            # ── Log exactly which sites the engine will scrape this cycle ─────
            if disabled_sites:
                skipped = ", ".join(f"'{s.name}'" for s in disabled_sites)
                print(f"[Thread B] ⏭️ Skipping disabled site(s): {skipped}")
            if target_sites:
                site_names = ", ".join(f"'{s.name}'" for s in target_sites)
                print(f"[Thread B] 🔄 Cycle starting — {len(target_sites)} site(s): {site_names}")
            else:
                print("[Thread B] ⚠️ No enabled TargetSites in DB — sleeping 60s.")
                await asyncio.sleep(60)
                continue
            if not keywords:
                print("[Thread B] ⚠️ No Keywords in DB — sleeping 60s.")
                await asyncio.sleep(60)
                continue
            _stats["engine_status"] = "Running"
            _redis_set_stats(_stats)
            _redis_publish("engine_status", {"status": "Running"})
            cycle_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # Keyword batching/progress tracking (optional)
            active_round_id: int | None = None
            selected_by_site: dict[int, list[tuple[Keyword, int | None]]] = {}
            if keyword_batch_enabled:
                from datetime import timedelta
                now_dt = datetime.now()
                round_deadline = now_dt + timedelta(hours=4)  # NOTE(review): unused — the deadline is recomputed from active_round.started_at below
                def _create_round_and_items(db_round: Session, sites, kws) -> ScrapeRound:
                    # Create a new active ScrapeRound plus one pending item for
                    # every (site × keyword) pair. Caller commits.
                    r = ScrapeRound(started_at=datetime.now(), status="active")
                    db_round.add(r)
                    db_round.flush()
                    for s in sites:
                        for kw in kws:
                            db_round.add(
                                ScrapeRoundItem(
                                    round_id=r.id,
                                    site_id=s.id,
                                    keyword_id=kw.id,
                                    status="pending",
                                    attempt_count=0,
                                    first_pending_at=None,
                                    last_attempt_at=None,
                                    last_error=None,
                                    last_hour_warn_at=None,
                                )
                            )
                    db_round.flush()
                    return r
                db_round = SessionLocal()
                try:
                    # Load (or create) active round.
                    active_round = (
                        db_round.query(ScrapeRound)
                        .filter(ScrapeRound.status == "active")
                        .order_by(ScrapeRound.started_at.desc())
                        .first()
                    )
                    if not active_round:
                        active_round = _create_round_and_items(db_round, target_sites, keywords)
                        db_round.commit()
                    else:
                        # If retry window expired, finish + start a new round now.
                        deadline = active_round.started_at + timedelta(hours=4)
                        if datetime.now() >= deadline:
                            # Mark all remaining pending/in-progress keyword attempts as failed.
                            for ri in db_round.query(ScrapeRoundItem).filter(
                                ScrapeRoundItem.round_id == active_round.id,
                                ScrapeRoundItem.status.in_(["pending", "in_progress"]),
                            ).all():
                                ri.status = "failed"
                                ri.last_error = "Retry window expired (4h)"
                                ri.last_attempt_at = datetime.now()
                            db_round.flush()
                            active_round.status = "finished"
                            active_round.finished_at = datetime.now()
                            db_round.flush()
                            active_round = _create_round_and_items(db_round, target_sites, keywords)
                            db_round.commit()
                    active_round_id = active_round.id
                    # Ensure round items exist for current (enabled sites × keywords).
                    # Insert any missing pairs as pending.
                    existing = (
                        db_round.query(ScrapeRoundItem.site_id, ScrapeRoundItem.keyword_id)
                        .filter(ScrapeRoundItem.round_id == active_round_id)
                        .all()
                    )
                    existing_pairs = {(sid, kid) for sid, kid in existing}
                    missing = []
                    for s in target_sites:
                        for kw in keywords:
                            if (s.id, kw.id) not in existing_pairs:
                                missing.append(
                                    ScrapeRoundItem(
                                        round_id=active_round_id,
                                        site_id=s.id,
                                        keyword_id=kw.id,
                                        status="pending",
                                        attempt_count=0,
                                        first_pending_at=None,
                                    )
                                )
                    if missing:
                        db_round.add_all(missing)
                        db_round.flush()
                        db_round.commit()
                    # Select up to max_tabs_per_site pending keywords per site.
                    pending_items = (
                        db_round.query(ScrapeRoundItem)
                        .filter(
                            ScrapeRoundItem.round_id == active_round_id,
                            ScrapeRoundItem.status == "pending",
                        )
                        .all()
                    )
                    pending_map: dict[tuple[int, int], ScrapeRoundItem] = {
                        (ri.site_id, ri.keyword_id): ri for ri in pending_items
                    }
                    selected_pairs: dict[int, list[tuple[Keyword, int | None]]] = {
                        s.id: [] for s in target_sites
                    }
                    now_sel = datetime.now()
                    for s in target_sites:
                        for kw in keywords:
                            ri = pending_map.get((s.id, kw.id))
                            if not ri:
                                continue
                            # Claim the item for this cycle before scraping it.
                            selected_pairs[s.id].append((kw, ri.id))
                            ri.status = "in_progress"
                            ri.attempt_count = (ri.attempt_count or 0) + 1
                            ri.last_attempt_at = now_sel
                            ri.last_error = None
                            db_round.flush()
                            if len(selected_pairs[s.id]) >= max_tabs_per_site:
                                break
                    db_round.commit()
                    selected_by_site = selected_pairs
                finally:
                    db_round.close()
            # ── Parallel execution: one coroutine per site ────────────────────
            # Semaphore caps how many browsers can be active at the same time.
            # When max_concurrent_browsers=1 (default) behaviour is identical
            # to the old sequential loop — no risk, no change in output.
            _browser_sem = asyncio.Semaphore(max_concurrent_browsers)
            async def _scrape_one_site(site) -> None:
                """Launch a browser for one site, scrape all keywords (optionally
                in parallel tabs), then close the browser."""
                async with _browser_sem:
                    batch_pairs: list[tuple[Keyword, int | None]]
                    if keyword_batch_enabled:
                        batch_pairs = selected_by_site.get(site.id, [])
                        if not batch_pairs:
                            return  # nothing pending for this site in the active round
                    else:
                        # No batching — every keyword runs, with no round item id.
                        batch_pairs = [(kw, None) for kw in keywords]
                    # ── Resolve settings inside the coroutine (thread-safe reads) ──
                    _browser_label, _browser_exe = _resolve_browser()
                    _incognito = _get_config("incognito_mode", "false").lower() == "true"
                    _global_show_browser = _get_config("show_browser", "false").lower() == "true"
                    # Per-site override:
                    #  - if global show_browser=true -> visible for all sites
                    #  - else -> site becomes visible only if custom_visible_browser=1
                    _site_custom_visible = int(getattr(site, "custom_visible_browser", 0) or 0) == 1
                    _show_browser = _global_show_browser or _site_custom_visible
                    _headless = not _show_browser
                    if _show_browser:
                        print(
                            f"[Browser] 👁️ VISIBLE MODE — browser window will open on screen. "
                            f"Close Ghost Node or toggle off in Settings when done debugging."
                        )
                    _launch_args = [
                        "--no-sandbox",
                        # ── Background-throttling kill switches ───────────────
                        # These flags tell Chromium's internal scheduler and
                        # renderer to treat this browser exactly the same whether
                        # it is the foreground window, minimised, or behind other
                        # windows. No OS focus is ever needed.
                        #
                        # Without these flags Chromium intentionally slows down
                        # background tabs: JS timers fire at 1 Hz instead of
                        # normal rate, GPU compositing pauses, and wake-locks are
                        # dropped — all of which cause silent scraping failures.
                        # Prevents the renderer process from being deprioritised
                        # when the window loses OS focus or is minimised
                        "--disable-renderer-backgrounding",
                        # Prevents background tabs from having their JS timers
                        # throttled to 1-second intervals
                        "--disable-background-timer-throttling",
                        # Prevents Chromium from pausing rendering for windows
                        # that are fully hidden behind other windows
                        "--disable-backgrounding-occluded-windows",
                        # Disables Chromium's own background network activity
                        # (update pings etc.) so traffic is only ours
                        "--disable-background-networking",
                        # Forces Chromium to keep all tabs at the same priority
                        # regardless of visibility
                        "--force-fieldtrials=BackgroundTabStopping/disable",
                    ]
                    # Pick a consistent agent profile — UA, platform, language,
                    # locale and timezone all match so HTTP headers agree with
                    # navigator properties (detectors cross-check these).
                    _profile = random.choice(_agent_profiles)
                    _vw, _vh = _profile["viewport"]
                    _launch_kwargs_base: dict = {
                        "headless": _headless,
                        "args": _launch_args,
                        "user_agent": _profile["ua"],
                        "viewport": {"width": _vw, "height": _vh},
                        "locale": _profile["locale"],
                        "timezone_id": _profile["tz"],
                        "extra_http_headers": {
                            "Accept-Language": _profile["lang"],
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                        },
                    }
                    if _browser_exe:
                        _launch_kwargs_base["executable_path"] = _browser_exe
                    # N1: Add proxy if enabled
                    _proxy = _get_proxy()
                    if _proxy:
                        _launch_kwargs_base["proxy"] = _proxy
                    _visibility_tag = "VISIBLE 👁" if _show_browser else "headless"
                    try:
                        if _incognito:
                            # Incognito: throwaway browser + fresh context, no
                            # persisted profile directory.
                            browser = await pw.chromium.launch(
                                headless=_headless,
                                args=_launch_args,
                                **({"executable_path": _browser_exe} if _browser_exe else {}),
                            )
                            context = await browser.new_context(
                                user_agent=_profile["ua"],
                                viewport={"width": _vw, "height": _vh},
                                locale=_profile["locale"],
                                timezone_id=_profile["tz"],
                            )
                            print(f"[Browser] 🕵️ Launched {_browser_label} — INCOGNITO + {_visibility_tag}")
                        else:
                            # Persistent: per-site profile dirs so cookie jars
                            # don't bleed across different sites.
                            _site_slug = re.sub(r"[^\w]", "_", site.name.lower())[:20]
                            _profile_dir = os.path.join(
                                os.path.dirname(__file__), ".browser_profiles", _site_slug
                            )
                            os.makedirs(_profile_dir, exist_ok=True)
                            context = await pw.chromium.launch_persistent_context(
                                _profile_dir,
                                **_launch_kwargs_base,
                            )
                            browser = None  # persistent context manages its own lifecycle
                            print(f"[Browser] 🚀 Launched {_browser_label} — NORMAL + {_visibility_tag}")
                        # ── 30-property stealth init script ──────────────────────
                        # Built from the selected agent profile so every property
                        # (UA, platform, language, WebGL renderer, canvas noise,
                        # audio noise, screen size, timing) is internally consistent.
                        await context.add_init_script(_build_stealth_script(_profile))
                        # ── Delay 1: post-launch settle time ─────────────────────
                        if delay_launch > 0:
                            print(
                                f"[Browser] ⏳ Post-launch delay: {delay_launch}s "
                                f"— browser open, waiting {delay_launch}s before first navigation"
                            )
                            await asyncio.sleep(delay_launch)
                            print(f"[Browser] ✅ Post-launch delay done — navigating now")
                        # ── Tab-level semaphore: caps parallel keywords per site ──
                        # Each keyword runs in its own page (tab) within this
                        # browser context. Tabs share cookies/session (good for
                        # sites that require login) but have independent V8 runtimes
                        # and network stacks so they don't block each other.
                        # Each tab gets its own DB session to avoid SQLAlchemy
                        # thread-local conflicts across concurrent coroutines.
                        _tab_sem = asyncio.Semaphore(max_tabs_per_site)
                        async def _scrape_one_keyword(kw, round_item_id: int | None, is_first: bool) -> None:
                            # One tab, one keyword, one dedicated DB session.
                            async with _tab_sem:
                                page = await context.new_page()
                                # Block heavy static assets — cuts bandwidth/latency.
                                await page.route(
                                    "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
                                    lambda route: route.abort(),
                                )
                                _kw_db = SessionLocal()
                                try:
                                    outcome = await scrape_site(
                                        page, site, kw, _kw_db,
                                        delay_post_search=delay_search,
                                        delay_page_hold=delay_hold,
                                        delay_site_open=delay_site_open,
                                        is_first_keyword=is_first,
                                        humanize_level=humanize_level,
                                        round_item_id=round_item_id,
                                    )
                                    found = int(outcome.get("new_count") or 0)
                                    print(
                                        f"[Scraper] ✓ {site.name} | '{kw.term}' "
                                        f"{found} new ({outcome.get('status')})"
                                    )
                                finally:
                                    _kw_db.close()
                                    await page.close()
                        if max_tabs_per_site == 1:
                            # Sequential mode — preserve the original inter-keyword
                            # jitter so timing patterns stay natural.
                            for _kw_idx, (kw, ri_id) in enumerate(batch_pairs):
                                await _scrape_one_keyword(kw, ri_id, is_first=(_kw_idx == 0))
                                if _kw_idx < len(batch_pairs) - 1:
                                    jitter = _jitter(random.uniform(8, 20), pct=0.4)
                                    await asyncio.sleep(jitter)
                        else:
                            # Parallel tab mode — all keywords start simultaneously,
                            # capped by max_tabs_per_site. No inter-keyword sleep
                            # because tabs are already staggered by network I/O.
                            tab_tasks = [
                                _scrape_one_keyword(kw, ri_id, is_first=(i == 0))
                                for i, (kw, ri_id) in enumerate(batch_pairs)
                            ]
                            await asyncio.gather(*tab_tasks, return_exceptions=True)
                        # Close the context (both modes); close browser only if not persistent
                        await context.close()
                        if browser is not None:
                            await browser.close()
                    except Exception as browser_exc:
                        # One site's browser failure must never kill the cycle.
                        print(f"[Thread B] Browser error on {site.name}: {browser_exc}")
            # ── Launch all site coroutines — sequential or parallel ───────────
            if max_concurrent_browsers == 1:
                print(f"[Thread B] 🔁 Sequential mode (max_concurrent_browsers=1)")
                for site in target_sites:
                    await _scrape_one_site(site)
            else:
                capped = min(max_concurrent_browsers, len(target_sites))
                print(
                    f"[Thread B] ⚡ Parallel mode — {capped} browser(s) × "
                    f"{max_tabs_per_site} tab(s)/site"
                )
                await asyncio.gather(
                    *[_scrape_one_site(site) for site in target_sites],
                    return_exceptions=True,
                )
            # If keyword batching is enabled, finish the active round once
            # every (site, keyword) has become done/failed.
            if keyword_batch_enabled and active_round_id is not None:
                db_check = SessionLocal()
                try:
                    remaining = (
                        db_check.query(ScrapeRoundItem)
                        .filter(
                            ScrapeRoundItem.round_id == active_round_id,
                            ScrapeRoundItem.status.in_(["pending", "in_progress"]),
                        )
                        .count()
                    )
                    if remaining == 0:
                        r = db_check.query(ScrapeRound).filter(ScrapeRound.id == active_round_id).first()
                        if r and r.status == "active":
                            r.status = "finished"
                            r.finished_at = datetime.now()
                            db_check.flush()
                            db_check.commit()
                finally:
                    db_check.close()
            # Hourly warning bookkeeping for pending keyword retries.
            # Only items that have already had at least one failed attempt
            # (attempt_count > 0) are eligible for hourly warnings.
            if keyword_batch_enabled and active_round_id is not None:
                db_warn = SessionLocal()
                try:
                    r = db_warn.query(ScrapeRound).filter(ScrapeRound.id == active_round_id).first()
                    now_warn = datetime.now()
                    if r:
                        updated = 0
                        pending_items = (
                            db_warn.query(ScrapeRoundItem)
                            .filter(
                                ScrapeRoundItem.round_id == active_round_id,
                                ScrapeRoundItem.status == "pending",
                            )
                            .all()
                        )
                        for ri in pending_items:
                            if not (ri.attempt_count or 0) > 0:
                                continue
                            # Timestamp base: last warning, else when the item first
                            # went pending, else the round start.
                            base = ri.last_hour_warn_at or ri.first_pending_at or r.started_at
                            if not base:
                                continue
                            if (now_warn - base).total_seconds() >= 3600:
                                ri.last_hour_warn_at = now_warn
                                updated += 1
                        if updated:
                            db_warn.flush()
                            db_warn.commit()
                finally:
                    db_warn.close()
            _stats["last_cycle"] = cycle_start
            _stats["engine_status"] = "Idle — waiting next cycle"
            _redis_set_stats(_stats)
            _redis_publish("cycle_complete", {"last_cycle": cycle_start})
            # ── N8: Boost mode — shorten interval when a lot closes soon ──────
            # Check if any tracked lot closes within 30 min. If so, use
            # boost_interval_mins instead of the normal timer so the engine
            # refreshes more often during the critical closing window.
            _boost_secs = int(_get_config("boost_interval_mins", "2")) * 60
            _db_boost = SessionLocal()
            try:
                from datetime import timedelta
                _soon_cutoff = datetime.now() + timedelta(minutes=30)  # NOTE(review): unused — the query filters on time_left_mins directly
                _closing_soon = _db_boost.query(Listing).filter(
                    Listing.time_left_mins != None,
                    Listing.time_left_mins <= 30,
                    Listing.time_left_mins > 0,
                ).count()
            finally:
                _db_boost.close()
            _effective_timer = _boost_secs if _closing_soon else timer_val
            _boost_label = f" [⚡ BOOST MODE — {_closing_soon} lot(s) closing soon]" if _closing_soon else ""
            # Apply ±25% jitter to the cycle timer so requests never arrive
            # at a perfectly predictable interval (a classic bot signature).
            _sleep_actual = _jitter(_effective_timer, pct=0.25)
            print(f"[Thread B] ✅ Cycle complete. Sleeping {int(_sleep_actual)}s (timer={_effective_timer}s ±25%).{_boost_label}")
            # Poll every 5 s so that any API write (new keyword, site, config
            # change) sets _cycle_now and the engine wakes up immediately
            # instead of waiting the full inter-cycle sleep.
            _slept = 0.0
            _poll_interval = 5.0
            while _slept < _sleep_actual:
                if _cycle_now.is_set():
                    _cycle_now.clear()
                    print("[Thread B] ⚡ Change detected — skipping sleep, starting new cycle now.")
                    break
                _chunk = min(_poll_interval, _sleep_actual - _slept)
                await asyncio.sleep(_chunk)
                _slept += _chunk
async def _price_refresh_pass() -> None:
    """
    Single price-refresh pass — visits every saved lot page, pulls current
    price + time-left (and detail images), and writes any changes to DB.

    Runs in its OWN dedicated asyncio event loop (Thread D) so it is
    completely isolated from the main scraper loop (Thread B).
    The two loops never share an event loop, never block each other, and
    never compete for the same browser instance. SQLite handles concurrent
    DB writes via its WAL journal — each function uses its own SessionLocal.

    Fix vs. previous revision:
      * the per-listing Playwright page is now closed in a ``finally``
        block — previously ``page.close()`` was skipped whenever the
        per-lot ``except`` path fired, leaking one open page per failed
        lot for the lifetime of the pass;
      * ``timeLeftToMins`` is evaluated once per lot instead of twice.
    """
    db = SessionLocal()
    try:
        # Placeholder rows ("no-link-…") never had a real URL — skip them.
        listings = db.query(Listing).filter(Listing.link.notlike("no-link-%")).all()
    except Exception as exc:
        print(f"[Refresh] ❌ DB read failed: {exc}")
        db.close()
        return
    if not listings:
        db.close()
        return
    print(f"[Refresh] 🔄 Starting price pass — {len(listings)} lot(s)…")
    updated = 0
    try:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=True,
                args=[
                    "--no-sandbox",
                    "--disable-renderer-backgrounding",
                    "--disable-background-timer-throttling",
                    "--disable-backgrounding-occluded-windows",
                ],
            )
            context = await browser.new_context(
                user_agent=random.choice(_rotating_agents),
                locale="en-GB",
            )
            # Mask the headless fingerprint so countdown timers keep ticking.
            await context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "document.hasFocus=()=>true;"
            )
            for listing in listings:
                page = None
                try:
                    page = await context.new_page()
                    # Abort heavy asset requests — only DOM text is needed.
                    await page.route(
                        "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
                        lambda route: route.abort(),
                    )
                    await page.goto(
                        listing.link, timeout=30_000, wait_until="domcontentloaded"
                    )
                    # ── Smart wait: Apollo cache polling (HiBid & GraphQL SPAs) ───────
                    _lot_id_m = re.search(r'/(?:lot|item|product)/(\d+)', listing.link, re.IGNORECASE)
                    _lot_id = _lot_id_m.group(1) if _lot_id_m else None
                    try:
                        await page.wait_for_function(
                            JS_APOLLO_WAIT, arg=_lot_id, timeout=8000, polling=200
                        )
                    except Exception:
                        pass  # best-effort — fall through to the fixed settle delay
                    await page.wait_for_timeout(1500)
                    # Pull price + time from the detail page
                    data = await page.evaluate(r"""() => {
                    const PRICE_SELS = [
                        '[class*="current-bid"] [class*="amount"]',
                        '[class*="current-bid"]', '[class*="bid-amount"]',
                        '.s-item__price', '[itemprop="price"]',
                        'span[class*="price"]', '.price', '[class*="price"]',
                    ];
                    const TIME_SELS = [
                        '[class*="time-left"]', '[class*="timeleft"]',
                        '[class*="countdown"]', '[class*="closing-time"]',
                        '[class*="time-remaining"]', '[class*="ends-in"]',
                        '.s-item__time-left', '[class*="expire"]',
                        '[class*="end-time"]', 'time',
                    ];
                    const q = sels => { for (const s of sels) {
                        try { const el = document.querySelector(s); if (el) return el; }
                        catch(e) {} } return null; };
                    const pe = q(PRICE_SELS), te = q(TIME_SELS);
                    let pt = pe ? (
                        pe.innerText ||
                        pe.getAttribute('data-price') ||
                        pe.getAttribute('content') || ''
                    ).trim() : '';
                    if (pt.includes('\n')) {
                        const ln = pt.split('\n').find(x => /\d/.test(x));
                        if (ln) pt = ln.trim();
                    }
                    return { price_text: pt, time_text: te ? te.innerText.trim() : '' };
                    }""")
                    # Pull images via shared 5-layer extractor (same as initial scrape)
                    img_urls = await page.evaluate(JS_DETAIL_IMAGES)
                    price_text = (data.get("price_text") or "").strip()
                    time_text = (data.get("time_text") or "").strip()
                    if not price_text and not time_text and not img_urls:
                        continue
                    amount, currency = _extract_price_and_currency(price_text)
                    price_display = _format_price(amount, currency)
                    time_left_str = _extract_time_left(time_text)
                    # Single timeLeftToMins call (the old code evaluated it twice).
                    _mins = timeLeftToMins(time_left_str) if time_left_str else None
                    tl_mins = round(_mins, 4) if _mins is not None and _mins != float('inf') else None
                    changed = False
                    if amount is not None and amount != listing.price:
                        listing.price = amount
                        listing.currency = currency[:10] if currency else ""
                        listing.price_raw = price_display[:100]
                        changed = True
                    if time_left_str and time_left_str != listing.time_left:
                        listing.time_left = time_left_str[:60]
                        listing.time_left_mins = tl_mins
                        changed = True
                    # Update images whenever the URL set differs from what we stored —
                    # handles count changes (0→5, 1→5) AND quality upgrades where count
                    # stays the same but URLs differ (thumbnail→full-size).
                    # Guard: never overwrite a good set with an empty result.
                    if img_urls:
                        existing_imgs = []
                        try:
                            existing_imgs = json.loads(listing.images or "[]")
                        except Exception:
                            pass
                        if img_urls != existing_imgs:
                            listing.images = json.dumps(img_urls[:10])
                            changed = True
                    if changed:
                        listing.price_updated_at = datetime.now()
                        db.commit()
                        updated += 1
                        print(
                            f"[Refresh] ✅ {listing.title[:35]}"
                            f"{price_display} | {time_left_str}"
                        )
                except Exception as lot_exc:
                    print(f"[Refresh] ⚠️ {listing.link[:55]}: {lot_exc}")
                    continue
                finally:
                    # Always release the page — even when goto/evaluate failed —
                    # so pages can't pile up in the shared context.
                    if page is not None:
                        try:
                            await page.close()
                        except Exception:
                            pass
            await browser.close()
    except Exception as exc:
        print(f"[Refresh] ❌ Browser error: {exc}")
    finally:
        db.close()
    print(f"[Refresh] ✅ Pass done — {updated}/{len(listings)} updated.")
def run_scraper_thread() -> None:
    """Thread B — drives the main scraper plus the Telegram C2 poller.

    Owns a private asyncio event loop; price refresh (Thread D) is kept
    entirely off this loop.
    """
    scraper_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(scraper_loop)
    scraper_loop.run_until_complete(
        asyncio.gather(nuclear_engine(), telegram_c2_loop())
    )
def run_refresh_thread() -> None:
    """
    Thread D — price/time-left refresh, completely isolated from Thread B.

    Spins up its own asyncio event loop so it never competes with the
    scraper for the event loop, browser instances, or DB connections;
    SQLite WAL mode makes concurrent writes from both threads safe.

    Cadence: sleep 5 minutes, run one full refresh pass, repeat. If a pass
    outlasts the 5-minute window (large listing table), the next one simply
    begins right after it finishes — passes can never overlap.
    """
    import asyncio as _aio

    async def _refresh_forever() -> None:
        # 5-minute gap between passes (the first pass waits one interval too).
        sleep_secs = 300
        print("[Thread D] 💰 Price-refresh thread online.")
        while True:
            await _aio.sleep(sleep_secs)
            try:
                await _price_refresh_pass()
            except Exception as exc:
                print(f"[Thread D] ❌ Unhandled error in refresh pass: {exc}")

    refresh_loop = _aio.new_event_loop()
    _aio.set_event_loop(refresh_loop)
    refresh_loop.run_until_complete(_refresh_forever())
# ─────────────────────────────────────────────────────────────────────────────
# Thread A — FastAPI Dashboard
# ─────────────────────────────────────────────────────────────────────────────
# Thread A — the FastAPI application served by uvicorn on port 3001.
app = FastAPI(title="Ghost Node", version="1.0.0")
# Wide-open CORS so the dashboard can be served from any origin/port.
# NOTE(review): allow_origins=["*"] exposes every endpoint cross-origin —
# fine for a localhost tool, revisit before any public deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ── Static files / Dashboard ─────────────────────────────────────────────────
# Legacy single-file dashboard, served when no Next.js static build exists.
DASHBOARD_PATH = os.path.join(os.path.dirname(__file__), "dashboard.html")
@app.get("/", response_class=HTMLResponse)
async def serve_dashboard():
    """Serve the UI: Next.js static export when built, else legacy dashboard.html."""
    next_index = os.path.join(os.path.dirname(__file__), "frontend", "out", "index.html")
    # First existing candidate wins — Next.js build takes priority.
    for candidate in (next_index, DASHBOARD_PATH):
        if os.path.exists(candidate):
            with open(candidate, "r", encoding="utf-8") as fh:
                return HTMLResponse(content=fh.read())
    return HTMLResponse("<h1>Dashboard not found</h1>", status_code=404)
@app.get("/legacy", response_class=HTMLResponse)
async def serve_legacy():
    """Always serves the original dashboard.html regardless of Next.js build."""
    if not os.path.exists(DASHBOARD_PATH):
        return HTMLResponse("<h1>Legacy dashboard not found</h1>", status_code=404)
    with open(DASHBOARD_PATH, "r", encoding="utf-8") as fh:
        return HTMLResponse(content=fh.read())
# ── Stats ────────────────────────────────────────────────────────────────────
@app.get("/api/stats")
def get_stats():
    """Return the live stats dict plus computed uptime in whole seconds."""
    elapsed = int(time.time() - _stats["uptime_start"])
    return dict(_stats, uptime_seconds=elapsed)
# ── Listings ─────────────────────────────────────────────────────────────────
@app.get("/api/listings")
def get_listings(limit: int = 100, db: Session = Depends(get_db)):
    """Most recent listings, newest first, capped at `limit` rows."""
    recent = (
        db.query(Listing)
        .order_by(Listing.timestamp.desc())
        .limit(limit)
        .all()
    )
    return [row.to_dict() for row in recent]
@app.delete("/api/listings/{listing_id}")
def delete_listing(listing_id: int, db: Session = Depends(get_db)):
    """Delete a single listing by id; 404 when it does not exist."""
    target = db.query(Listing).filter(Listing.id == listing_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    return {"status": "deleted"}
@app.delete("/api/listings")
def clear_listings(db: Session = Depends(get_db)):
    """Bulk-delete every row in the listings table."""
    db.query(Listing).delete()
    db.commit()
    return {"status": "cleared"}
@app.get("/api/listings/countdown-sync")
def countdown_sync(db: Session = Depends(get_db)):
    """
    Lightweight endpoint polled every 60s by the dashboard countdown ticker.

    Returns just enough per-listing data to keep the live countdown honest:
      - id
      - time_left_mins   (float, refreshed by price-refresh Thread D)
      - price_updated_at (ISO string — when time_left_mins was measured)
      - timestamp        (fallback reference when price_updated_at is null)

    Far cheaper than /api/listings since title/price/link/score are skipped.
    The frontend uses this to silently patch data-tlmins and data-captured
    on each .tl-cell without triggering a full table re-render.
    """
    def _iso(dt):
        # Serialize a datetime (or None) for the JSON payload.
        return dt.isoformat() if dt else None

    slim_rows = db.query(
        Listing.id,
        Listing.time_left_mins,
        Listing.price_updated_at,
        Listing.timestamp,
    ).all()
    return [
        {
            "id": row.id,
            "time_left_mins": row.time_left_mins,
            "price_updated_at": _iso(row.price_updated_at),
            "timestamp": _iso(row.timestamp),
        }
        for row in slim_rows
    ]
@app.get("/api/listings/refresh-status")
def get_refresh_status(db: Session = Depends(get_db)):
    """
    Report the newest price_updated_at across all listings.

    The dashboard polls this every 30s and re-fetches /api/listings when
    the timestamp moves — so price updates show up automatically.
    """
    from sqlalchemy import func as sqlfunc
    newest = db.query(sqlfunc.max(Listing.price_updated_at)).scalar()
    total = db.query(Listing).count()
    return {
        "last_price_update": newest.isoformat() if newest else None,
        "listing_count": total,
    }
# ── Keywords ─────────────────────────────────────────────────────────────────
@app.get("/api/keywords")
def get_keywords(db: Session = Depends(get_db)):
    """All keywords ordered by manual sort position, then id."""
    ordered = (
        db.query(Keyword)
        .order_by(Keyword.sort_order.asc(), Keyword.id.asc())
        .all()
    )
    return [kw.to_dict() for kw in ordered]
@app.post("/api/keywords")
async def add_keyword(request: Request, db: Session = Depends(get_db)):
    """
    Create a keyword.

    Body: {"term": str, "weight": int (optional, default 1)}.
    Returns 400 for a malformed body / missing term / non-numeric weight,
    409 for a duplicate term, otherwise the new keyword dict.
    Wakes the scraper immediately on success.

    Fix: a malformed JSON body or non-numeric "weight" previously raised
    (ValueError/TypeError) and surfaced as HTTP 500 — now a clean 400.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    term = str(body.get("term", "")).strip()
    if not term:
        return JSONResponse({"error": "term required"}, status_code=400)
    try:
        weight = int(body.get("weight", 1))
    except (TypeError, ValueError):
        return JSONResponse({"error": "weight must be an integer"}, status_code=400)
    existing = db.query(Keyword).filter(Keyword.term == term).first()
    if existing:
        return JSONResponse({"error": "duplicate"}, status_code=409)
    # assign sort_order = max + 1 so new keyword goes to the bottom
    max_order = db.query(Keyword).count()
    kw = Keyword(term=term, weight=weight, sort_order=max_order)
    db.add(kw)
    db.commit()
    db.refresh(kw)
    _cycle_now.set()  # wake scraper immediately
    return kw.to_dict()
@app.put("/api/keywords/{kw_id}")
async def update_keyword(kw_id: int, request: Request, db: Session = Depends(get_db)):
    """Update keyword term, weight, ai_target, min_price, max_price, and/or sort_order.

    Partial update: only keys present in the JSON body are touched.
    Returns 404 when the keyword does not exist, 409 when renaming to a
    term already used by another keyword, otherwise the updated dict.
    Wakes the scraper immediately after committing.

    NOTE(review): int()/float() coercions below raise on malformed values
    and surface as HTTP 500 — consider returning 400 instead (as add_keyword
    could); confirm whether the dashboard ever sends non-numeric values.
    """
    row = db.query(Keyword).filter(Keyword.id == kw_id).first()
    if not row:
        return JSONResponse({"error": "not found"}, status_code=404)
    body = await request.json()
    if "term" in body:
        new_term = str(body["term"]).strip()
        # Rename only when non-empty and actually different; reject if the
        # new term collides with a *different* keyword row.
        if new_term and new_term != row.term:
            conflict = db.query(Keyword).filter(Keyword.term == new_term, Keyword.id != kw_id).first()
            if conflict:
                return JSONResponse({"error": "duplicate term"}, status_code=409)
            row.term = new_term
    if "weight" in body:
        # Clamp to a minimum of 1; falsy values (0, "", None) default to 1.
        row.weight = max(1, int(body["weight"] or 1))
    if "ai_target" in body:
        # Empty string clears the AI target.
        row.ai_target = str(body["ai_target"]).strip() or None
    if "min_price" in body:
        v = body["min_price"]
        # The dashboard may send null, "" or the literal string "null" to clear.
        row.min_price = float(v) if v not in (None, "", "null") else None
    if "max_price" in body:
        v = body["max_price"]
        row.max_price = float(v) if v not in (None, "", "null") else None
    if "sort_order" in body:
        row.sort_order = int(body["sort_order"])
    db.flush()
    db.commit()
    db.refresh(row)
    _cycle_now.set()  # wake scraper immediately
    return row.to_dict()
@app.post("/api/keywords/reorder")
async def reorder_keywords(request: Request, db: Session = Depends(get_db)):
    """Accepts {order: [id, id, ...]} and rewrites sort_order to match."""
    payload = await request.json()
    for position, kw_id in enumerate(payload.get("order", [])):
        db.query(Keyword).filter(Keyword.id == kw_id).update({"sort_order": position})
    db.flush()
    db.commit()
    return {"status": "reordered"}
@app.delete("/api/keywords/{kw_id}")
def delete_keyword(kw_id: int, db: Session = Depends(get_db)):
    """Delete one keyword; 404 when missing. Wakes the scraper on success."""
    target = db.query(Keyword).filter(Keyword.id == kw_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    _cycle_now.set()  # wake scraper immediately
    return {"status": "deleted"}
# ── N6: Scoring Rules ─────────────────────────────────────────────────────────
@app.get("/api/scoring-rules")
def get_scoring_rules(db: Session = Depends(get_db)):
    """All scoring rules in id order."""
    rules = db.query(ScoringRule).order_by(ScoringRule.id.asc()).all()
    return [rule.to_dict() for rule in rules]
@app.post("/api/scoring-rules")
async def create_scoring_rule(request: Request, db: Session = Depends(get_db)):
    """
    Create a scoring rule.

    Body: {"signal": str, "delta": int, "notes": str (optional)}.
    Returns 400 for missing/invalid fields, 409 when the signal already
    exists (case-insensitive match), otherwise the new rule dict.

    Fix: a non-numeric "delta" previously raised ValueError → HTTP 500;
    it now returns a clean 400. int(delta) is also computed once instead
    of twice.
    """
    body = await request.json()
    signal = (body.get("signal") or "").strip()
    delta = body.get("delta")
    if not signal or delta is None:
        return JSONResponse({"error": "signal and delta required"}, status_code=400)
    try:
        delta_int = int(delta)
    except (TypeError, ValueError):
        return JSONResponse({"error": "delta must be an integer"}, status_code=400)
    if db.query(ScoringRule).filter(ScoringRule.signal.ilike(signal)).first():
        return JSONResponse({"error": "duplicate signal"}, status_code=409)
    rule = ScoringRule(
        signal=signal[:100],
        delta=delta_int,
        # category mirrors the sign of delta
        category="positive" if delta_int > 0 else "negative",
        notes=(body.get("notes") or "").strip() or None,
    )
    db.add(rule)
    db.flush()  # push INSERT to the SQLite WAL before commit
    db.commit()
    return rule.to_dict()
@app.put("/api/scoring-rules/{rule_id}")
async def update_scoring_rule(rule_id: int, request: Request, db: Session = Depends(get_db)):
    """Patch signal/delta/notes on an existing rule; 404 when missing."""
    rule = db.query(ScoringRule).filter(ScoringRule.id == rule_id).first()
    if rule is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    body = await request.json()
    if "signal" in body:
        rule.signal = body["signal"].strip()[:100]
    if "delta" in body:
        rule.delta = int(body["delta"])
        # category always tracks the sign of delta
        rule.category = "positive" if rule.delta > 0 else "negative"
    if "notes" in body:
        rule.notes = (body["notes"] or "").strip() or None
    db.flush()
    db.commit()
    return rule.to_dict()
@app.delete("/api/scoring-rules/{rule_id}")
def delete_scoring_rule(rule_id: int, db: Session = Depends(get_db)):
    """Delete one scoring rule by id; 404 when it does not exist."""
    rule = db.query(ScoringRule).filter(ScoringRule.id == rule_id).first()
    if rule is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(rule)
    db.commit()
    return {"status": "deleted"}
# ── Target Sites ─────────────────────────────────────────────────────────────
@app.get("/api/sites")
def get_sites(db: Session = Depends(get_db)):
    """All target sites ordered by manual sort position, then id."""
    ordered = (
        db.query(TargetSite)
        .order_by(TargetSite.sort_order.asc(), TargetSite.id.asc())
        .all()
    )
    return [site.to_dict() for site in ordered]
@app.get("/api/sites/enabled-count")
def get_enabled_site_count(db: Session = Depends(get_db)):
    """Number of currently enabled target sites.

    Used by the Settings page to validate max_concurrent_browsers.
    """
    enabled = db.query(TargetSite).filter(TargetSite.enabled == 1).count()
    return {"count": enabled}
@app.get("/api/scrape/progress")
def get_scrape_progress(db: Session = Depends(get_db)):
    """
    Live scrape-progress for keyword batching mode.
    Returns the active round and the “pending retries that need attention”
    (attempt_count>0 only), including hourly warning timing.

    Response shape:
      keyword_batch_enabled — config flag as bool
      active_round          — {id, started_at, deadline_at} or None
      counts                — pending/in_progress/done/failed item counts
      pending_items         — up to 50 pending items that were attempted
                              at least once, most-retried first
    """
    keyword_batch_enabled = _get_config("keyword_batch_enabled", "false").lower() == "true"
    active_round = (
        db.query(ScrapeRound)
        .filter(ScrapeRound.status == "active")
        .order_by(ScrapeRound.started_at.desc())
        .first()
    )
    if not active_round:
        # No round in flight — return an empty skeleton with zeroed counts.
        return {
            "keyword_batch_enabled": keyword_batch_enabled,
            "active_round": None,
            "counts": {"pending": 0, "in_progress": 0, "done": 0, "failed": 0},
            "pending_items": [],
        }
    round_id = active_round.id
    now_dt = datetime.now()
    # A round is considered due to finish 4 hours after it started.
    deadline_at = active_round.started_at + timedelta(hours=4)
    def _count_status(st: str) -> int:
        # Count this round's items in a given status.
        return (
            db.query(ScrapeRoundItem)
            .filter(ScrapeRoundItem.round_id == round_id, ScrapeRoundItem.status == st)
            .count()
        )
    counts = {
        "pending": _count_status("pending"),
        "in_progress": _count_status("in_progress"),
        "done": _count_status("done"),
        "failed": _count_status("failed"),
    }
    pending_items = (
        db.query(ScrapeRoundItem)
        .filter(ScrapeRoundItem.round_id == round_id, ScrapeRoundItem.status == "pending")
        .order_by(ScrapeRoundItem.attempt_count.desc(), ScrapeRoundItem.last_attempt_at.desc())
        .limit(50)
        .all()
    )
    # Resolve site/keyword display names in two bulk queries (avoids N+1).
    site_ids = list({ri.site_id for ri in pending_items})
    keyword_ids = list({ri.keyword_id for ri in pending_items})
    site_map = {s.id: s.name for s in db.query(TargetSite).filter(TargetSite.id.in_(site_ids)).all()} if site_ids else {}
    kw_map = {k.id: k.term for k in db.query(Keyword).filter(Keyword.id.in_(keyword_ids)).all()} if keyword_ids else {}
    items_out = []
    for ri in pending_items:
        # Queued-but-never-attempted items have attempt_count==0.
        if (ri.attempt_count or 0) <= 0:
            continue
        # warn_due fires when ≥1 hour has passed since the last warning
        # (or, before any warning, since the item first went pending).
        base = ri.last_hour_warn_at or ri.first_pending_at or active_round.started_at
        warn_due = bool(base and (now_dt - base).total_seconds() >= 3600)
        items_out.append(
            {
                "round_item_id": ri.id,
                "site_name": site_map.get(ri.site_id, str(ri.site_id)),
                "keyword_term": kw_map.get(ri.keyword_id, str(ri.keyword_id)),
                "attempt_count": ri.attempt_count or 0,
                "first_pending_at": ri.first_pending_at.isoformat() if ri.first_pending_at else None,
                "last_attempt_at": ri.last_attempt_at.isoformat() if ri.last_attempt_at else None,
                "last_hour_warn_at": ri.last_hour_warn_at.isoformat() if ri.last_hour_warn_at else None,
                "last_error": (ri.last_error or "")[:300] if ri.last_error else None,
                "warn_due": warn_due,
            }
        )
    return {
        "keyword_batch_enabled": keyword_batch_enabled,
        "active_round": {
            "id": active_round.id,
            "started_at": active_round.started_at.isoformat() if active_round.started_at else None,
            "deadline_at": deadline_at.isoformat() if deadline_at else None,
        },
        "counts": counts,
        "pending_items": items_out,
    }
@app.post("/api/sites/reorder")
async def reorder_sites(request: Request, db: Session = Depends(get_db)):
    """Accepts {order: [id, id, ...]} and rewrites sort_order to match."""
    payload = await request.json()
    for position, site_id in enumerate(payload.get("order", [])):
        db.query(TargetSite).filter(TargetSite.id == site_id).update({"sort_order": position})
    db.flush()
    db.commit()
    return {"status": "reordered"}
@app.post("/api/sites")
async def add_site(request: Request, db: Session = Depends(get_db)):
    """
    Registers a new TargetSite.
    Uses request.json() directly (no Pydantic model) to prevent 400 errors.
    Sets enabled=1 explicitly — never relies on column default under concurrent load.
    Calls db.flush() before db.commit() to force the INSERT into the SQLite
    WAL immediately, so the scraper thread's next DB session sees the new row.
    URL mode is inferred automatically:
      Mode A — Direct:   url_template contains {keyword}
                         → scraper substitutes keyword and navigates directly.
      Mode B — Homepage: url_template has NO {keyword}
                         → scraper navigates to the URL then types in the
                           search box identified by search_selector.
                           search_selector is required in this mode.
    Returns 400 for a malformed body or missing name/url_template, otherwise
    the new site dict. Wakes the scraper immediately after saving.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    name = str(body.get("name", "")).strip()
    template = str(body.get("url_template", "")).strip()
    selector = str(body.get("search_selector", "")).strip()
    def _coerce_int_01(val, default: int = 0) -> int:
        # Normalize bool / "true"/"false" / numeric strings to a strict 0 or 1
        # so SQLite stores an INTEGER flag, never a Python bool.
        if val is None:
            return default
        if isinstance(val, bool):
            return 1 if val else 0
        s = str(val).strip().lower()
        if s in ("1", "true", "yes", "on"):
            return 1
        if s in ("0", "false", "no", "off", ""):
            return 0
        try:
            return 1 if int(float(val)) != 0 else 0
        except Exception:
            return default
    if not name or not template:
        return JSONResponse({"error": "name and url_template are required"}, status_code=400)
    # ── Infer navigation mode and warn (not reject) for homepage mode ────────
    is_direct_mode = "{keyword}" in template
    if is_direct_mode:
        mode_label = "DIRECT (keyword substitution)"
    else:
        mode_label = "HOMEPAGE (search-box interaction)"
        if not selector:
            # Warn but still save — operator can add selector via PUT later
            print(
                f"[API] ⚠️ Site '{name}' saved in HOMEPAGE mode but "
                f"search_selector is empty. Add a CSS selector "
                f"(e.g. 'input#st') via the Target Sites tab or the "
                f"scraper will skip this site until one is provided."
            )
    # max_pages is clamped to at least 1; falsy values default to 1.
    max_pages = max(1, int(body.get("max_pages", 1) or 1))
    requires_login = bool(body.get("requires_login", False))
    login_url = str(body.get("login_url", "") or "").strip()
    login_check = str(body.get("login_check_selector", "") or "").strip()
    # login_enabled defaults to requires_login when not sent explicitly.
    login_enabled = bool(body.get("login_enabled", requires_login))
    custom_visible_browser = _coerce_int_01(body.get("custom_visible_browser", 0), default=0)
    max_order = db.query(TargetSite).count()  # new site goes to the bottom
    site = TargetSite(
        name=name,
        url_template=template,
        search_selector=selector,
        enabled=1,  # explicit — never rely on column default for critical flag
        max_pages=max_pages,
        sort_order=max_order,
        requires_login=requires_login,
        login_url=login_url,
        login_check_selector=login_check,
        login_enabled=login_enabled,
        custom_visible_browser=custom_visible_browser,
    )
    db.add(site)
    db.flush()   # pushes INSERT to SQLite WAL before commit
    db.commit()
    db.refresh(site)
    print(f"[API] ✅ New TargetSite saved: '{site.name}' id={site.id} "
          f"mode={mode_label} pages={max_pages} login={requires_login}")
    # Auto-adapt: if enabled, kick off AI selector generation immediately for new site
    # (safe to create_task here — this endpoint is async, so a loop is running).
    if _get_config("auto_adapt_enabled", "false").lower() == "true":
        asyncio.create_task(adapt_site_now(site.id))
        print(f"[AutoAdapt] 🆕 New site '{site.name}' — auto-adapt queued.")
    _cycle_now.set()  # wake scraper immediately
    return site.to_dict()
@app.put("/api/sites/{site_id}")
async def update_site(site_id: int, request: Request, db: Session = Depends(get_db)):
    """
    Updates a TargetSite row.
    Coerces 'enabled' to a plain integer (1 or 0) regardless of whether the
    dashboard sends a JSON boolean (true/false) or integer (1/0) — SQLite
    stores INTEGER and the filter TargetSite.enabled == 1 must see an int.
    db.flush() is called before db.commit() to push the UPDATE into the
    SQLite WAL immediately, closing the race window where the scraper thread
    could open a new session and read stale 'enabled' values.
    Partial update: only keys present in the body are touched.
    Returns 400 for a malformed body, 404 when the site does not exist,
    otherwise the updated site dict. Wakes the scraper after committing.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    row = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if not row:
        return JSONResponse({"error": "not found"}, status_code=404)
    # Free-text fields copied through verbatim when present.
    for field in ("name", "url_template", "search_selector"):
        if field in body:
            setattr(row, field, body[field])
    # ── Coerce 'enabled' to plain int so SQLite stores 1 or 0, never True/False ──
    if "enabled" in body:
        row.enabled = 1 if body["enabled"] else 0
    def _coerce_int_01(val, default: int = 0) -> int:
        # Normalize bool / "true"/"false" / numeric strings to a strict 0 or 1.
        if val is None:
            return default
        if isinstance(val, bool):
            return 1 if val else 0
        s = str(val).strip().lower()
        if s in ("1", "true", "yes", "on"):
            return 1
        if s in ("0", "false", "no", "off", ""):
            return 0
        try:
            return 1 if int(float(val)) != 0 else 0
        except Exception:
            return default
    # ── max_pages and login fields ────────────────────────────────────────────
    if "max_pages" in body:
        # Clamp to at least 1; falsy values default to 1.
        row.max_pages = max(1, int(body["max_pages"] or 1))
    for field in ("requires_login", "login_enabled"):
        if field in body:
            setattr(row, field, bool(body[field]))
    for field in ("login_url", "login_check_selector"):
        if field in body:
            setattr(row, field, str(body[field] or "").strip())
    if "custom_visible_browser" in body:
        row.custom_visible_browser = _coerce_int_01(body.get("custom_visible_browser"), default=0)
    db.flush()   # ← pushes UPDATE to WAL; scraper thread sees it immediately
    db.commit()
    db.refresh(row)
    status = "ENABLED ✅" if row.enabled == 1 else "DISABLED ⏸"
    print(f"[API] ✅ Site '{row.name}' (id={site_id}) → {status}")
    _cycle_now.set()  # wake scraper immediately
    return row.to_dict()
@app.delete("/api/sites/{site_id}")
def delete_site(site_id: int, db: Session = Depends(get_db)):
    """Delete one target site; 404 when missing. Wakes the scraper on success."""
    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if site is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(site)
    db.commit()
    _cycle_now.set()  # wake scraper immediately
    return {"status": "deleted"}
# ── Config / Settings ─────────────────────────────────────────────────────────
@app.get("/api/config")
def get_config(db: Session = Depends(get_db)):
    """Dump the whole Config table as a flat {key: value} dict."""
    return {entry.key: entry.value for entry in db.query(Config).all()}
@app.post("/api/config")
async def save_config(request: Request, db: Session = Depends(get_db)):
    """
    Accepts a JSON dict of key→value pairs and upserts into the Config table.
    Uses request.json() directly to avoid Pydantic 400 errors.
    db.flush() forces the UPDATEs/INSERTs into the SQLite WAL before commit,
    ensuring the scraper thread's next _get_config() call sees fresh values.
    All values are stringified before storage; returns the list of saved keys.
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    saved_keys: list[str] = []
    for key, value in body.items():
        # Upsert: update the existing row if present, otherwise insert.
        row = db.query(Config).filter(Config.key == key).first()
        if row:
            row.value = str(value)
        else:
            db.add(Config(key=key, value=str(value)))
        saved_keys.append(key)
    db.flush()    # push dirty rows to SQLite WAL
    db.commit()   # finalise the transaction on disk
    # Terminal confirmation — proves the write happened
    print(f"[API] ✅ Config saved to DB: {saved_keys}")
    for k in saved_keys:
        row = db.query(Config).filter(Config.key == k).first()
        # Mask values longer than 6 chars (tokens/secrets) in the log output.
        # NOTE(review): the appended suffix below reads as an empty string —
        # it may have lost an "…" character (the file carries invisible-Unicode
        # warnings); confirm against the original intent before "fixing".
        display = row.value[:6] + "" if row and row.value and len(row.value) > 6 else (row.value if row else "")
        print(f"   {k} = {display!r}")
    _cycle_now.set()  # wake scraper immediately
    return {"status": "saved", "keys": saved_keys}
# ── N16: AI Test Endpoint ──────────────────────────────────────────────────────
@app.post("/api/ai/test")
async def ai_test(request: Request):
    """
    Dry-run the AI filter against a sample title/target pair.
    Body: {"title": "...", "ai_target": "..."}
    Returns: {"match": bool, "reason": "..."} (plus provider when enabled).
    """
    try:
        body = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON"}, status_code=400)
    title = str(body.get("title", "")).strip()
    ai_target = str(body.get("ai_target", "")).strip()
    if not (title and ai_target):
        return JSONResponse({"error": "title and ai_target required"}, status_code=400)
    provider = _get_config("ai_provider", "groq").strip().lower()
    if provider == "none":
        # Filter disabled — everything passes.
        return {"match": True, "reason": "AI provider is set to none — filter disabled."}
    match, reason = await _ai_analyze(title, ai_target)
    return {"match": match, "reason": reason, "provider": provider}
@app.get("/api/ai/debug/log")
def ai_debug_log(limit: int = 200, since_id: int = 0):
    """
    Return the in-memory AI debug log (newest entries last).
    - limit: max entries to return (default 200, capped at 300)
    - since_id: only return entries with id > since_id (for polling — pass
      the last id you received to get only new entries since then)
    Requires ai_debug = true in config to produce entries; always returns the
    current buffer regardless.

    Fix: total_in_buffer previously re-read the shared deque AFTER the lock
    was released (racing concurrent writers) and wrapped it in list() just
    to take len(). The count now comes from the snapshot taken under the
    lock, so entries and total are mutually consistent.
    """
    with _ai_debug_log_lock:
        entries = list(_ai_debug_log)
    total = len(entries)  # size of the buffer at snapshot time
    if since_id > 0:
        entries = [e for e in entries if e.get("id", 0) > since_id]
    entries = entries[-min(limit, 300):]
    return {
        "debug_enabled": _ai_debug_enabled(),
        "total_in_buffer": total,
        "entries": entries,
    }
@app.delete("/api/ai/debug/log")
def ai_debug_log_clear():
    """Empty the in-memory AI debug log buffer (under its lock)."""
    with _ai_debug_log_lock:
        _ai_debug_log.clear()
    return {"status": "ok", "message": "AI debug log cleared."}
# ── Engine Control ─────────────────────────────────────────────────────────────
@app.post("/api/engine/pause")
def engine_pause():
    """Flip engine status to Paused and fan the change out via Redis."""
    new_state = "Paused"
    _stats["engine_status"] = new_state
    _redis_set_stats(_stats)
    _redis_publish("engine_status", {"status": new_state})
    return {"status": "paused"}
@app.post("/api/engine/resume")
def engine_resume():
    """Flip engine status back to Running and fan the change out via Redis."""
    new_state = "Running"
    _stats["engine_status"] = new_state
    _redis_set_stats(_stats)
    _redis_publish("engine_status", {"status": new_state})
    return {"status": "running"}
@app.post("/api/engine/restart")
def engine_restart():
    """
    Cross-platform hard restart.
    Strategy:
      1. Respond HTTP 200 immediately so the client gets a clean response.
      2. A daemon thread waits 1 second (lets uvicorn flush the response),
         then spawns a brand-new Python process running the same script with
         the same arguments via subprocess.Popen.
      3. After spawning the child, the current process calls os._exit(0) to
         terminate itself immediately and release port 3001.
    Why not os.execv?
      os.execv works on Linux but on Windows it does NOT replace the current
      process — it creates a new one while the old one keeps running, which
      causes an "address already in use" error on port 3001.
    Why subprocess.Popen + os._exit(0)?
      Popen detaches the child before the parent exits, so the child is
      never left as an orphan. os._exit(0) bypasses Python's atexit hooks
      and __del__ finalizers which can deadlock when uvicorn is still
      running threads.
    """
    import threading, subprocess
    def _do_restart() -> None:
        time.sleep(1.0)  # give uvicorn time to flush the HTTP response
        try:
            print("[GhostNode] 🔄 Spawning new process…")
            # Inherit stdout/stderr so the new process logs to the same terminal
            subprocess.Popen(
                [sys.executable] + sys.argv,
                stdout=None,
                stderr=None,
                close_fds=True,
            )
            print("[GhostNode] ✅ New process launched — shutting down this instance.")
        except Exception as exc:
            # Spawn failed — abort the restart rather than killing the only
            # running instance.
            print(f"[GhostNode] ❌ Restart failed: {exc}")
            return
        # Kill this process immediately — port 3001 is now free for the child
        os._exit(0)
    # Daemon thread: must not block process exit, and returns control to
    # uvicorn so the HTTP 200 below actually reaches the client first.
    threading.Thread(target=_do_restart, daemon=True, name="GhostNode-Restart").start()
    return {
        "status": "restarting",
        "message": "New process spawning — this instance will exit in ~1 second.",
    }
@app.post("/api/engine/kill")
def engine_kill():
    """
    Hard-kill Ghost Node immediately — no restart, no respawn.
    Responds HTTP 200 first; a daemon thread then calls os._exit(0) after a
    300 ms flush window, taking the whole process down: all threads, the
    scraper, the Telegram C2 loop, and uvicorn.
    The dashboard goes offline and will NOT reconnect automatically — the
    operator must restart manually from the terminal.
    """
    def _terminate() -> None:
        time.sleep(0.3)  # let uvicorn flush the response
        print("[GhostNode] ☠ KILL signal received — terminating process.")
        os._exit(0)

    threading.Thread(target=_terminate, daemon=True, name="GhostNode-Kill").start()
    return {"status": "killed", "message": "Process terminating in ~300ms."}
# ── Telegram connectivity test ────────────────────────────────────────────────
@app.post("/api/telegram/test")
async def test_telegram():
    """
    Sends a test message using whatever token/chat_id is currently in the DB.
    Returns the full Telegram response body so you can diagnose 401/404 etc.
    """
    token = _get_config("telegram_token")
    chat_id = _get_config("telegram_chat_id")
    if not (token and chat_id):
        return JSONResponse(
            {"ok": False, "error": "No token or chat_id saved in DB. Open Settings tab and save first."},
            status_code=400,
        )
    endpoint = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": "👻 Ghost Node — Telegram test OK!", "parse_mode": "HTML"}
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(endpoint, data=payload)
            body = resp.json()
    except Exception as exc:
        return JSONResponse({"ok": False, "error": str(exc)}, status_code=500)
    if resp.status_code == 200:
        return {"ok": True, "telegram_response": body}
    # Non-200 from Telegram: still answer HTTP 200 so the dashboard JS can
    # read the error details out of the body.
    return JSONResponse(
        {"ok": False, "http_status": resp.status_code, "telegram_response": body},
        status_code=200,
    )
# ── DB read-back diagnostic ───────────────────────────────────────────────────
# ── N14 — Login trigger endpoint ─────────────────────────────────────────────
@app.post("/api/sites/{site_id}/login")
async def trigger_login(site_id: int, db: Session = Depends(get_db)):
    """
    Opens a VISIBLE browser window on the site's login_url so the user can
    manually log in. The session is saved to the persistent profile for that
    site and reused by the scraper on all future cycles.
    Only works when login_enabled = true for this site.
    Returns immediately — the browser window stays open for the user to log in.
    """
    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if not site:
        return JSONResponse({"error": "site not found"}, status_code=404)
    if not site.login_enabled:
        return JSONResponse({"error": "login_enabled is false for this site"}, status_code=400)
    if not site.login_url:
        return JSONResponse({"error": "No login_url configured for this site"}, status_code=400)
    import re as _re2
    # Per-site profile directory keyed by a filesystem-safe slug of the name.
    site_slug = _re2.sub(r"[^a-z0-9]", "_", site.name.lower())[:40]
    profile_dir = os.path.join(os.path.dirname(__file__), ".browser_profiles", site_slug)
    os.makedirs(profile_dir, exist_ok=True)
    async def _open_login_browser():
        from playwright.async_api import async_playwright
        async with async_playwright() as pw:
            try:
                _lbl, _exe = _resolve_browser()
                # Persistent context: cookies/localStorage land in profile_dir
                # and survive this browser session.
                ctx = await pw.chromium.launch_persistent_context(
                    profile_dir,
                    executable_path=_exe or None,
                    headless=False,   # MUST be visible so user can log in
                    args=["--no-sandbox"],
                )
                page = await ctx.new_page()
                await page.goto(site.login_url, timeout=60_000, wait_until="domcontentloaded")
                print(f"[Login] 🔑 Browser open for {site.name} — log in and close when done.")
                # Wait up to 10 minutes for the user to log in and close
                # NOTE(review): this waits for the context's "close" event —
                # confirm it fires when the user closes the window manually.
                await ctx.wait_for_event("close", timeout=600_000)
                print(f"[Login] ✅ Session saved for {site.name}.")
            except Exception as exc:
                print(f"[Login] ❌ {exc}")
    # Run in background — don't block the API response
    asyncio.create_task(_open_login_browser())
    return {
        "status": "browser_opening",
        "message": f"A visible browser window is opening for {site.name}. Log in and close it when done — the session will be saved automatically.",
        "login_url": site.login_url,
        "profile_dir": profile_dir,
    }
# ── N17 — Auto-Adapter endpoints ─────────────────────────────────────────────
@app.post("/api/sites/{site_id}/adapt")
async def trigger_adapt(site_id: int, db: Session = Depends(get_db)):
    """
    Trigger AI selector generation for a site.

    Launches a temporary browser, scrapes the site, sends cleaned HTML to
    Groq (online) or Ollama (local) for CSS selector generation, validates
    live, and persists to the site_selectors table.
    Returns immediately with a task status; full result logged to console.
    """
    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if not site:
        return JSONResponse({"error": "site not found"}, status_code=404)
    cfg = {r.key: r.value for r in db.query(Config).all()}
    provider = cfg.get("ai_provider", "groq")
    if provider == "none":
        return JSONResponse({"error": "AI provider is set to 'none'. Enable Groq or Ollama in Settings."}, status_code=400)
    # NOTE: auto_adapt_enabled only gates *automatic* adaptation on new site creation.
    # Manual adaptation via the 🤖 ADAPT button is always permitted if an AI provider
    # is configured — do NOT gate it on auto_adapt_enabled.

    # Detach-safe copy: the ORM session closes when this handler returns.
    site_name = site.name

    async def _run():
        result = await adapt_site_now(site_id)
        confidence = result.get("confidence", 0)
        # "✅" restored — the success emoji was lost to an encoding strip upstream.
        status = "✅" if confidence >= 50 else "⚠️"
        print(f"[AutoAdapt] {status} Manual adapt for {site_name} done — confidence {confidence:.1f}")

    # BUGFIX: hold a strong reference to the task (asyncio only keeps weak
    # refs) so it cannot be garbage-collected before it completes.
    task = asyncio.create_task(_run())
    pending = getattr(app.state, "_bg_tasks", None)
    if pending is None:
        pending = set()
        app.state._bg_tasks = pending
    pending.add(task)
    task.add_done_callback(pending.discard)
    return {
        "status": "adapting",
        "message": f"AI selector generation started for '{site_name}'. Check console for progress. Reload the Sites tab in ~30s to see the result.",
        "site_id": site_id,
        "provider": provider,
    }
@app.get("/api/sites/{site_id}/selectors")
def get_site_selectors(site_id: int, db: Session = Depends(get_db)):
    """Return the stored AI selectors for a site, or None if never adapted."""
    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if site is None:
        return JSONResponse({"error": "site not found"}, status_code=404)
    selectors_row = db.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
    return {
        "site_id": site_id,
        "site_name": site.name,
        "selectors": selectors_row.to_dict() if selectors_row else None,
    }
@app.delete("/api/sites/{site_id}/selectors")
def delete_site_selectors(site_id: int, db: Session = Depends(get_db)):
    """Delete stored AI selectors for a site (forces re-adaptation on next cycle)."""
    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if not site:
        return JSONResponse({"error": "site not found"}, status_code=404)
    ss = db.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
    if not ss:
        return {"status": "ok", "message": "No selectors stored for this site."}
    # (dropped a no-op db.flush(): nothing had been written to this session,
    # so there was nothing to flush before the delete)
    db.delete(ss)
    db.commit()
    return {"status": "ok", "message": f"Selectors for '{site.name}' deleted. Site will use universal extractor until re-adapted."}
# ── N15 — Export endpoints ────────────────────────────────────────────────────
import csv, json as _json
from io import StringIO
from fastapi.responses import StreamingResponse
@app.get("/api/export/csv")
def export_csv(limit: int = 10000, db: Session = Depends(get_db)):
    """Export all listings to a CSV file download."""
    listings = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
    buffer = StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["ID", "Title", "Price", "Currency", "Price Raw", "Time Left", "Score",
                     "Keyword", "Site", "Link", "Captured At", "Price Updated At"])
    # One row per listing; None fields are emitted as empty cells.
    writer.writerows(
        [
            row.id,
            row.title,
            row.price or "",
            row.currency or "",
            row.price_raw or "",
            row.time_left or "",
            row.score,
            row.keyword or "",
            row.site_name or "",
            row.link,
            row.timestamp.isoformat() if row.timestamp else "",
            row.price_updated_at.isoformat() if row.price_updated_at else "",
        ]
        for row in listings
    )
    buffer.seek(0)
    return StreamingResponse(
        iter([buffer.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": "attachment; filename=ghost_node_listings.csv"},
    )
@app.get("/api/export/json")
def export_json(limit: int = 10000, db: Session = Depends(get_db)):
    """Export all listings to a JSON file download."""
    listings = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
    payload = [row.to_dict() for row in listings]
    # default=str stringifies datetimes (and anything else non-serializable).
    body = _json.dumps(payload, indent=2, default=str)
    return StreamingResponse(
        iter([body]),
        media_type="application/json",
        headers={"Content-Disposition": "attachment; filename=ghost_node_listings.json"},
    )
@app.get("/api/export/html")
def export_html(limit: int = 10000, db: Session = Depends(get_db)):
    """Export all listings as a self-contained HTML report.

    SECURITY: every listing-derived field is HTML-escaped. Titles, keywords
    and links are scraped from third-party sites, so un-escaped values could
    break the table markup or inject script into the report (stored XSS).
    """
    # Local import aliased because the local name `html` holds the document below.
    from html import escape as _esc
    rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    rows_html = ""
    for r in rows:
        score_color = "#00ff88" if r.score > 0 else "#888"
        ts_str = r.timestamp.strftime("%Y-%m-%d %H:%M") if r.timestamp else ""
        # Escape all scraped text; quote=True for the href attribute value.
        link = _esc(r.link or "", quote=True)
        title = _esc((r.title or "")[:80])
        price = _esc(r.price_raw or "")
        tleft = _esc(r.time_left or "")
        kw = _esc(r.keyword or "")
        site = _esc(r.site_name or "")
        rows_html += (
            "<tr>"
            f'<td><a href="{link}" target="_blank">{title}</a></td>'
            f"<td>{price}</td>"
            f"<td>{tleft}</td>"
            f'<td style="color:{score_color}">{r.score}</td>'
            f"<td>{kw}</td>"
            f"<td>{site}</td>"
            f'<td style="font-size:11px;color:#888">{ts_str}</td>'
            "</tr>\n"
        )
    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Ghost Node Export — {now}</title>
<style>
body{{font-family:monospace;background:#0d0d1a;color:#c8d3f0;padding:24px}}
h1{{color:#00f5ff;letter-spacing:3px}}
table{{width:100%;border-collapse:collapse;margin-top:16px}}
th{{background:#1a1a2e;color:#00f5ff;padding:8px 12px;text-align:left;border-bottom:1px solid #333}}
td{{padding:6px 12px;border-bottom:1px solid #1a1a2e;font-size:13px}}
tr:hover{{background:#1a1a2e}}
a{{color:#00f5ff;text-decoration:none}}
a:hover{{text-decoration:underline}}
.meta{{color:#888;font-size:12px;margin-bottom:12px}}
</style>
</head>
<body>
<h1>// GHOST NODE LISTINGS EXPORT</h1>
<div class="meta">Generated: {now} | {len(rows)} listings</div>
<table>
<thead><tr><th>Title</th><th>Price</th><th>Time Left</th><th>Score</th><th>Keyword</th><th>Site</th><th>Captured</th></tr></thead>
<tbody>
{rows_html}
</tbody>
</table>
</body>
</html>"""
    return StreamingResponse(
        iter([html]),
        media_type="text/html",
        headers={"Content-Disposition": f"attachment; filename=ghost_node_export_{now[:10]}.html"},
    )
# ── Database Backup & Restore ────────────────────────────────────────────────
@app.get("/api/backup/download")
def backup_download():
    """
    Stream the raw sniper.db SQLite file as a download.

    Only works when using SQLite (not PostgreSQL).
    Creates a timestamped filename so backups don't overwrite each other.
    Uses sqlite3's online backup API so a live WAL-mode DB is never
    streamed mid-write.
    """
    from database import DATABASE_URL, _is_sqlite
    if not _is_sqlite:
        return JSONResponse(
            {"error": "Backup only supported for SQLite. Use pg_dump for PostgreSQL."},
            status_code=400,
        )
    # Resolve the actual file path from the SQLite URL.
    db_path = DATABASE_URL.replace("sqlite:///", "").replace("sqlite://", "")
    # BUGFIX: use os.path.isabs — startswith("/") misses Windows absolute
    # paths like C:\..., which would then be wrongly re-joined.
    if not os.path.isabs(db_path):
        db_path = os.path.join(os.path.dirname(__file__), db_path.lstrip("./"))
    db_path = os.path.abspath(db_path)
    if not os.path.exists(db_path):
        return JSONResponse({"error": f"Database file not found: {db_path}"}, status_code=404)
    import tempfile
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_filename = f"ghost_node_backup_{ts}.db"
    tmp_path = os.path.join(tempfile.gettempdir(), backup_filename)
    try:
        import sqlite3 as _sqlite3
        # Close both connections even if the online backup fails midway.
        conn = _sqlite3.connect(db_path)
        try:
            bk = _sqlite3.connect(tmp_path)
            try:
                conn.backup(bk)
            finally:
                bk.close()
        finally:
            conn.close()
    except Exception as exc:
        # Don't leave a half-written temp file behind on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return JSONResponse({"error": f"Backup failed: {exc}"}, status_code=500)

    def _stream_and_delete():
        # Generator: stream in 64 KiB chunks, then delete the temp copy.
        try:
            with open(tmp_path, "rb") as f:
                while chunk := f.read(65536):
                    yield chunk
        finally:
            try:
                os.remove(tmp_path)
            except Exception:
                pass

    print(f"[Backup] 📦 Streaming backup: {backup_filename} ({os.path.getsize(tmp_path):,} bytes)")
    return StreamingResponse(
        _stream_and_delete(),
        media_type="application/octet-stream",
        headers={"Content-Disposition": f"attachment; filename={backup_filename}"},
    )
@app.post("/api/backup/restore")
async def backup_restore(request: Request):
    """
    Accept a .db file upload and replace the current sniper.db with it.

    The server restarts automatically after restore so all connections reopen.
    SAFETY: saves the current DB as an auto-backup before overwriting.
    Only works when using SQLite.
    """
    from database import DATABASE_URL, _is_sqlite
    if not _is_sqlite:
        return JSONResponse(
            {"error": "Restore only supported for SQLite."},
            status_code=400,
        )
    db_path = DATABASE_URL.replace("sqlite:///", "").replace("sqlite://", "")
    # BUGFIX: use os.path.isabs — startswith("/") misses Windows absolute
    # paths like C:\..., which would then be wrongly re-joined.
    if not os.path.isabs(db_path):
        db_path = os.path.join(os.path.dirname(__file__), db_path.lstrip("./"))
    db_path = os.path.abspath(db_path)
    try:
        body = await request.body()
        if len(body) < 100:
            return JSONResponse({"error": "Uploaded file appears empty or too small."}, status_code=400)
        # Verify it's a valid SQLite file (magic bytes: "SQLite format 3")
        if not body.startswith(b"SQLite format 3"):
            return JSONResponse(
                {"error": "File does not appear to be a valid SQLite database."},
                status_code=400,
            )
        # Auto-backup current DB before overwriting
        if os.path.exists(db_path):
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            auto_bk = db_path + f".pre_restore_{ts}.bak"
            import shutil
            shutil.copy2(db_path, auto_bk)
            print(f"[Restore] 💾 Auto-backup saved: {auto_bk}")
        # Write the uploaded file
        with open(db_path, "wb") as f:
            f.write(body)
        print(f"[Restore] ✅ Database restored from upload ({len(body):,} bytes). Restarting...")
        # Restart the process to reopen all DB connections. The 1s delay in a
        # daemon thread lets this HTTP response reach the client first.
        import threading

        def _restart():
            import time, sys
            time.sleep(1)
            os.execv(sys.executable, [sys.executable] + sys.argv)

        threading.Thread(target=_restart, daemon=True).start()
        return JSONResponse({
            "status": "restored",
            "message": "Database restored successfully. Ghost Node is restarting — refresh the dashboard in 5 seconds.",
            "bytes_written": len(body),
        })
    except Exception as exc:
        return JSONResponse({"error": f"Restore failed: {exc}"}, status_code=500)
@app.get("/api/redis/status")
def redis_status():
    """Check Redis connectivity and return the cached stats hash."""
    if _redis_client is None:
        return {"connected": False, "reason": "REDIS_URL not set or redis package missing"}
    try:
        _redis_client.ping()
        stats = _redis_client.hgetall(_REDIS_STATS_KEY)
    except Exception as exc:
        return {"connected": False, "reason": str(exc)}
    return {"connected": True, "url": os.environ.get("REDIS_URL", ""), "cached_stats": stats}
@app.get("/api/debug/db")
def debug_db(db: Session = Depends(get_db)):
    """
    Returns the exact contents of Config and TargetSite tables.
    Use this to confirm that Settings-tab saves and new sites
    are genuinely written to sniper.db.
    """
    configs = {r.key: r.value for r in db.query(Config).all()}
    # Mask token for security — show only first 8 chars.
    # BUGFIX: the ellipsis suffix had been lost to an encoding strip, leaving
    # a bare `t[:8] + ""`; restored and parenthesized (the conditional
    # expression binds looser than `+`).
    if "telegram_token" in configs and configs["telegram_token"]:
        t = configs["telegram_token"]
        configs["telegram_token"] = (t[:8] + "…") if len(t) > 8 else t
    sites = [s.to_dict() for s in db.query(TargetSite).all()]
    keywords = [k.to_dict() for k in db.query(Keyword).all()]
    return {
        "config": configs,
        "sites": sites,
        "keywords": keywords,
        "listing_count": db.query(Listing).count(),
    }
# ── Phase 7: Serve Next.js static build ─────────────────────────────────────
# Everything below is optional: it runs only when a pre-built Next.js export
# exists at <this dir>/frontend/out.
import pathlib as _pathlib
_frontend_out = _pathlib.Path(__file__).parent / "frontend" / "out"
if _frontend_out.exists():
    from fastapi.staticfiles import StaticFiles
    from fastapi.responses import FileResponse as _FileResponse
    # Mount ONLY the _next directory (JS/CSS/image assets).
    # We deliberately avoid app.mount("/", html=True) because it intercepts
    # ALL paths as a Starlette sub-app, shadowing explicit routes like /legacy.
    _next_dir = _frontend_out / "_next"
    if _next_dir.exists():
        app.mount("/_next", StaticFiles(directory=str(_next_dir)), name="nextjs_assets")
    # SPA catch-all — registered last so all specific @app.get() routes win.
    # Handles: exact files (favicon.ico, etc.), Next.js .html pages, and the
    # SPA index.html fallback for deep-linked client-side routes.
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        """Serve a static asset, an exported Next.js page, or the SPA shell.

        Resolution order: exact file → "<path>.html" (how `next export`
        names static pages) → index.html (client-side router takes over).
        """
        # NOTE(review): full_path is not sanitized here — confirm the framework
        # normalizes "../" segments, otherwise traversal could escape
        # _frontend_out.
        # 1. Exact file match (favicon.ico, *.svg, etc.)
        candidate = _frontend_out / full_path
        if candidate.is_file():
            return _FileResponse(str(candidate))
        # 2. Next.js exported page (e.g. "dashboard" → dashboard.html)
        html_candidate = _frontend_out / f"{full_path}.html"
        if html_candidate.is_file():
            return _FileResponse(str(html_candidate))
        # 3. SPA fallback — let the client-side router handle it
        return _FileResponse(str(_frontend_out / "index.html"))
    print("[GhostNode] Serving Next.js frontend from frontend/out/")
# ─────────────────────────────────────────────────────────────────────────────
# Entry Point — spin up threads
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Background workers, each a daemon thread running its own event loop:
    #   Thread B — scraper & Telegram C2 (share one asyncio loop)
    #   Thread D — price/time-left refresh (isolated loop, never blocks scraper)
    #   Thread E — closing-soon alert (isolated loop, polls every 60s)
    workers = [
        ("GhostNode-Scraper", run_scraper_thread),
        ("GhostNode-Refresh", run_refresh_thread),
        ("GhostNode-ClosingAlert", run_closing_alert_thread),
    ]
    for worker_name, entry_point in workers:
        threading.Thread(target=entry_point, name=worker_name, daemon=True).start()
    print("[GhostNode] 🕵️ Ghost Node online — Dashboard → http://localhost:3001")
    # Thread A (FastAPI via uvicorn — blocks main thread)
    uvicorn.run(app, host="0.0.0.0", port=3001, log_level="warning")