5466 lines
249 KiB
Python
5466 lines
249 KiB
Python
"""
|
||
Ghost Node — Worker
|
||
Three-thread architecture:
|
||
Thread A → FastAPI dashboard (port 8000)
|
||
Thread B → Async Playwright scraper (nuclear_engine)
|
||
Thread C → Telegram C2 polling loop
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import sys
|
||
if hasattr(sys.stdout, 'reconfigure'):
|
||
sys.stdout.reconfigure(encoding='utf-8')
|
||
sys.stderr.reconfigure(encoding='utf-8')
|
||
|
||
import asyncio
|
||
import collections
|
||
import difflib
|
||
import json
|
||
import os
|
||
import platform
|
||
import random
|
||
import re
|
||
import sys
|
||
import threading
|
||
import time
|
||
from contextlib import asynccontextmanager
|
||
from datetime import datetime, timedelta
|
||
from typing import Any, Optional
|
||
|
||
import httpx
|
||
import uvicorn
|
||
from fastapi import Depends, FastAPI, Request
|
||
from fastapi.middleware.cors import CORSMiddleware
|
||
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
||
from fastapi.staticfiles import StaticFiles
|
||
from playwright.async_api import async_playwright
|
||
from sqlalchemy.orm import Session
|
||
|
||
from database import SessionLocal, get_db
|
||
from models import Config, Keyword, Listing, ScoringRule, SiteSelectors, TargetSite, calculate_attribute_score, seed_database
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Bootstrap
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
seed_database() # idempotent — only seeds if tables are empty
|
||
|
||
# ── Print active-site roster so the operator can verify disabled sites ────────
|
||
def _print_active_sites() -> None:
    """
    Runs once at startup. Prints every TargetSite row with its enabled status
    so the operator can immediately see which sites the scraper will visit and
    confirm that any toggled-off sites are genuinely excluded.
    """
    session = SessionLocal()
    try:
        roster = session.query(TargetSite).order_by(TargetSite.id).all()
        print("\n[GhostNode] 📋 Target Site Roster:")
        print(" ┌─────┬──────────┬───────────────────────────────────────────────────────────────┐")
        for site in roster:
            # Enabled flag is stored as an int (1 = active).
            if site.enabled == 1:
                status = "✅ ACTIVE "
            else:
                status = "⏸ DISABLED"
            # A '{keyword}' placeholder means the site supports direct search URLs.
            mode = "DIRECT " if "{keyword}" in site.url_template else "HOMEPAGE"
            sel_str = f" sel={site.search_selector!r}" if site.search_selector else ""
            print(f" │ {site.id:<3} │ {status} │ {site.name:<20} [{mode}] {site.url_template[:40]}…{sel_str}")
        print(" └─────┴──────────┴───────────────────────────────────────────────────────────────┘\n")
    finally:
        session.close()


_print_active_sites()
|
||
|
||
# ── AI Debug Log ─────────────────────────────────────────────────────────────
|
||
# Circular buffer holding the last 300 AI call records.
|
||
# Written by _ai_log_entry(); read by GET /api/ai/debug/log.
|
||
# Active whenever ai_debug = true in config.
|
||
_ai_debug_log: collections.deque = collections.deque(maxlen=300)
|
||
_ai_debug_log_id: int = 0 # monotonic counter for ordering
|
||
_ai_debug_log_lock = threading.Lock()
|
||
|
||
def _ai_log_entry(entry: dict) -> None:
|
||
"""Append one record to the in-memory AI debug ring buffer."""
|
||
global _ai_debug_log_id
|
||
with _ai_debug_log_lock:
|
||
_ai_debug_log_id += 1
|
||
entry["id"] = _ai_debug_log_id
|
||
entry.setdefault("ts", datetime.utcnow().isoformat(timespec="seconds") + "Z")
|
||
_ai_debug_log.append(entry)
|
||
|
||
|
||
# Shared mutable state (thread-safe reads are fine for these primitives)
_stats: dict[str, Any] = {
    "total_scanned": 0,           # cumulative listings processed across all cycles
    "total_alerts": 0,            # cumulative alerts dispatched
    "last_cycle": "Never",        # display string for the most recent completed cycle
    "engine_status": "Idle",      # dashboard status label for the scraper thread
    "uptime_start": time.time(),  # epoch seconds at process start (for uptime display)
}

# Set by any API write endpoint (add/edit/delete keyword, site, config).
# The scraper loop polls this every 5 s during its inter-cycle sleep and
# wakes up immediately so changes take effect on the very next cycle.
_cycle_now = threading.Event()
|
||
|
||
# Each agent entry is paired with matching Accept-Language / platform hints
# so the full HTTP header set is internally consistent — detectors check
# that UA, Accept-Language, and navigator.platform all agree.
# Keys: ua (User-Agent header), platform (navigator.platform), vendor
# (navigator.vendor), lang (Accept-Language header), locale/tz (browser
# context settings), viewport (width, height) in CSS pixels.
_agent_profiles: list[dict] = [
    # Chrome 124 / Windows 10 / US English / US-East timezone
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-US,en;q=0.9",
        "locale": "en-US", "tz": "America/New_York",
        "viewport": (1920, 1080),
    },
    # Edge 123 / Windows 10 / UK English
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-GB,en;q=0.9",
        "locale": "en-GB", "tz": "Europe/London",
        "viewport": (1440, 900),
    },
    # Edge 124 / Windows 10 / UK English / common laptop resolution
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        "platform": "Win32", "vendor": "Google Inc.", "lang": "en-GB,en;q=0.9",
        "locale": "en-GB", "tz": "Europe/London",
        "viewport": (1366, 768),
    },
    # Chrome 124 / macOS 14 / US English / US-West timezone
    {
        "ua": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 "
              "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "platform": "MacIntel", "vendor": "Google Inc.", "lang": "en-US,en;q=0.9",
        "locale": "en-US", "tz": "America/Los_Angeles",
        "viewport": (1512, 982),
    },
    # Firefox 125 / Windows 10 — Firefox reports an empty navigator.vendor
    {
        "ua": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
        "platform": "Win32", "vendor": "", "lang": "en-US,en;q=0.5",
        "locale": "en-US", "tz": "America/Chicago",
        "viewport": (1280, 800),
    },
]
# Keep a flat UA list for legacy callers (price refresh loop etc.)
_rotating_agents: list[str] = [p["ua"] for p in _agent_profiles]
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Humanisation helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _jitter(base: float, pct: float = 0.35) -> float:
|
||
"""Return base ± pct%, minimum 0.3 s."""
|
||
spread = base * pct
|
||
return max(0.3, base + random.uniform(-spread, spread))
|
||
|
||
|
||
def _bezier_points(x0: int, y0: int, x1: int, y1: int, n: int) -> list[tuple[int,int]]:
|
||
"""
|
||
Generate n points along a cubic bezier curve between (x0,y0) and (x1,y1).
|
||
Two random control points give the path a natural arc rather than a
|
||
straight line — straight-line mouse movements are a strong bot signal.
|
||
"""
|
||
cx1 = x0 + random.randint(-120, 120)
|
||
cy1 = y0 + random.randint(-60, 60)
|
||
cx2 = x1 + random.randint(-120, 120)
|
||
cy2 = y1 + random.randint(-60, 60)
|
||
pts = []
|
||
for i in range(n):
|
||
t = i / (n - 1) if n > 1 else 0
|
||
mt = 1 - t
|
||
px = int(mt**3*x0 + 3*mt**2*t*cx1 + 3*mt*t**2*cx2 + t**3*x1)
|
||
py = int(mt**3*y0 + 3*mt**2*t*cy1 + 3*mt*t**2*cy2 + t**3*y1)
|
||
pts.append((px, py))
|
||
return pts
|
||
|
||
|
||
async def _human_mouse(page) -> None:
    """
    Heavy human mouse simulation.
    - Bezier-curve paths (not straight lines)
    - Variable speed: slow at start/end, faster in the middle (ease-in-out)
    - Random hover-pauses over elements as if reading them
    - Occasional micro-tremors (hand shake)
    - 5-9 movements total before scraping begins

    Args:
        page: Playwright async Page — only .evaluate() and .mouse.move()
              are used here.

    Never raises: any Playwright error is swallowed so a failed gesture
    cannot abort the surrounding scrape.
    """
    try:
        vw = await page.evaluate("window.innerWidth")
        vh = await page.evaluate("window.innerHeight")
        # Starting cursor position: upper-middle of the viewport.
        x = random.randint(int(vw * 0.25), int(vw * 0.55))
        y = random.randint(int(vh * 0.15), int(vh * 0.35))

        for move_n in range(random.randint(5, 9)):
            # Pick a target in the "content area" (not nav bars)
            tx = random.randint(int(vw * 0.05), int(vw * 0.90))
            ty = random.randint(int(vh * 0.10), int(vh * 0.85))
            pts = _bezier_points(x, y, tx, ty, random.randint(18, 35))

            for idx, (px, py) in enumerate(pts):
                # Micro-tremor: tiny random offset every step
                jx = px + random.randint(-2, 2)
                jy = py + random.randint(-2, 2)
                # Clamp so the tremor never pushes the cursor off-viewport.
                jx = max(1, min(vw - 1, jx))
                jy = max(1, min(vh - 1, jy))
                await page.mouse.move(jx, jy)

                # Ease-in-out speed: slow at edges of path, fast in middle
                # NOTE(review): progress divides by len(pts), not len(pts)-1,
                # so it never quite reaches 1.0 — harmless for pacing.
                progress = idx / len(pts)
                speed = 0.5 - 0.4 * abs(progress - 0.5) * 2  # ~0.1 at ends, ~0.5 mid
                await asyncio.sleep(random.uniform(speed * 0.015, speed * 0.045))

            x, y = tx, ty

            # After each movement: sometimes hover and "read" for a moment
            if random.random() < 0.55:
                await asyncio.sleep(random.uniform(0.3, 1.4))

            # Occasionally: brief fast micro-movements (thinking/fidgeting)
            if random.random() < 0.25:
                for _ in range(random.randint(3, 7)):
                    await page.mouse.move(
                        x + random.randint(-8, 8),
                        y + random.randint(-8, 8),
                    )
                    await asyncio.sleep(random.uniform(0.02, 0.06))

            await asyncio.sleep(random.uniform(0.05, 0.25))

    except Exception:
        pass  # never let mouse errors kill the scrape
|
||
|
||
|
||
async def _human_scroll(page, steps: int = 5) -> None:
    """
    Heavy human scroll simulation.
    - Variable scroll distances (people don't scroll the same amount each time)
    - Longer read-pauses mid-page (as if reading a listing title)
    - Occasional scroll-back-up to re-check something
    - Final scroll back toward top to simulate "I've seen enough" behaviour
    - Scroll wheel events (mousewheel) not just JS scrollBy

    Args:
        page:  Playwright async Page — only .evaluate() and .mouse.wheel()
               are used here.
        steps: number of forward scroll gestures to perform.

    Never raises: all Playwright errors are swallowed.
    """
    try:
        vh = await page.evaluate("window.innerHeight")
        # Net downward distance; decides whether the final scroll-up runs.
        total_scrolled = 0

        for step in range(steps):
            # Variable scroll: sometimes a quick skim, sometimes a slow read
            dist = int(vh * random.uniform(0.30, 0.85))

            # Use actual mouse wheel scroll — more realistic than JS scrollBy
            # Wheel delta in px — browsers normalise this but detectors see it
            await page.mouse.wheel(0, dist)
            total_scrolled += dist
            await asyncio.sleep(random.uniform(0.15, 0.35))

            # Random read-pause: longer stop as if reading a result
            if random.random() < 0.65:
                await asyncio.sleep(random.uniform(0.6, 2.2))

            # Occasional scroll-back-up (re-reading behaviour)
            if random.random() < 0.30 and step > 0:
                back = int(dist * random.uniform(0.25, 0.65))
                await page.mouse.wheel(0, -back)
                total_scrolled -= back
                await asyncio.sleep(random.uniform(0.4, 1.0))

        # Final: scroll back toward top (user finished scanning, ready to act)
        if total_scrolled > vh:
            await asyncio.sleep(random.uniform(0.5, 1.2))
            # Scroll back in 2-3 steps rather than teleporting to top
            # NOTE(review): every iteration wheels scroll_back // 2, so three
            # iterations travel ~1.5× scroll_back — presumably intentional
            # sloppiness; confirm before changing.
            scroll_back = int(total_scrolled * random.uniform(0.4, 0.75))
            for _ in range(random.randint(2, 3)):
                chunk = scroll_back // 2
                await page.mouse.wheel(0, -chunk)
                await asyncio.sleep(random.uniform(0.2, 0.5))

    except Exception:
        pass
|
||
|
||
|
||
def _build_stealth_script(profile: dict) -> str:
    """
    Returns a comprehensive JS init script that patches 30+ navigator/
    window properties checked by Cloudflare, DataDome, PerimeterX, and
    similar bot-detection systems.

    All patches use Object.defineProperty so they cannot be overwritten
    by the site's own JS after the fact.

    Args:
        profile: one entry from _agent_profiles; only the 'platform',
                 'vendor' and 'lang' keys are read (with sane defaults).

    Returns:
        JavaScript source text. Random values (hardwareConcurrency,
        deviceMemory, canvas-noise XOR mask, screen fallbacks, timing
        deltas) are baked in at call time, so each generated script
        differs slightly — re-call per browser context for fresh values.
    """
    platform = profile.get("platform", "Win32")
    vendor = profile.get("vendor", "Google Inc.")
    lang = profile.get("lang", "en-US,en;q=0.9")
    # Accept-Language first token
    lang0 = lang.split(",")[0].strip()

    # NOTE: the body below is a single f-string — literal JS braces are
    # doubled ({{ }}), single braces are Python interpolations.
    return f"""
(() => {{
    // ── 1. Core webdriver / automation flags ──────────────────────────────
    const def = (obj, prop, val) => {{
        try {{ Object.defineProperty(obj, prop, {{ get: () => val, configurable: true }}); }}
        catch(e) {{}}
    }};

    def(navigator, 'webdriver', undefined);
    def(navigator, 'plugins', [
        {{ name:'PDF Viewer', filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
        {{ name:'Chrome PDF Viewer', filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
        {{ name:'Chromium PDF Viewer',filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
        {{ name:'Microsoft Edge PDF Viewer',filename:'internal-pdf-viewer',description:'Portable Document Format', length:1 }},
        {{ name:'WebKit built-in PDF',filename:'internal-pdf-viewer', description:'Portable Document Format', length:1 }},
    ]);
    def(navigator, 'languages', ['{lang0}', '{lang0.split("-")[0]}']);
    def(navigator, 'platform', '{platform}');
    def(navigator, 'vendor', '{vendor}');
    def(navigator, 'hardwareConcurrency', {random.choice([4, 6, 8, 12, 16])});
    def(navigator, 'deviceMemory', {random.choice([4, 8, 16])});
    def(navigator, 'maxTouchPoints', 0);
    def(navigator, 'cookieEnabled', true);
    def(navigator, 'onLine', true);
    def(navigator, 'doNotTrack', null);

    // ── 2. Chrome runtime object — must be non-trivially populated ────────
    if (!window.chrome) window.chrome = {{}};
    window.chrome.runtime = window.chrome.runtime || {{
        id: undefined,
        connect: () => {{}},
        sendMessage: () => {{}},
        onMessage: {{ addListener: () => {{}} }},
    }};
    window.chrome.loadTimes = () => ({{
        requestTime: Date.now() / 1000 - Math.random() * 0.3,
        startLoadTime: Date.now() / 1000 - Math.random() * 0.2,
        commitLoadTime: Date.now() / 1000 - Math.random() * 0.1,
        finishDocumentLoadTime: Date.now() / 1000,
        finishLoadTime: Date.now() / 1000,
        firstPaintTime: Date.now() / 1000 - Math.random() * 0.05,
        firstPaintAfterLoadTime: 0,
        navigationType: 'Other',
        wasFetchedViaSpdy: true,
        wasNpnNegotiated: true,
        npnNegotiatedProtocol: 'h2',
        wasAlternateProtocolAvailable: false,
        connectionInfo: 'h2',
    }});
    window.chrome.csi = () => ({{
        startE: Date.now() - Math.floor(Math.random()*2000+500),
        onloadT: Date.now() - Math.floor(Math.random()*200),
        pageT: Math.random()*2000+300,
        tran: 15,
    }});

    // ── 3. Permissions API — real browser returns 'granted'/'prompt' ──────
    if (navigator.permissions) {{
        const _query = navigator.permissions.query.bind(navigator.permissions);
        navigator.permissions.query = (params) =>
            params.name === 'notifications'
                ? Promise.resolve({{ state: Notification.permission }})
                : _query(params);
    }}

    // ── 4. WebGL renderer — headless Chrome returns "SwiftShader" ─────────
    // Real GPU names: "ANGLE (Intel, Mesa Intel(R) UHD...)" etc.
    const _getCtx = HTMLCanvasElement.prototype.getContext;
    HTMLCanvasElement.prototype.getContext = function(type, ...args) {{
        const ctx = _getCtx.call(this, type, ...args);
        if (type === 'webgl' || type === 'webgl2') {{
            const _getPara = ctx.getParameter.bind(ctx);
            ctx.getParameter = function(param) {{
                if (param === 37445) return 'Intel Inc.';
                if (param === 37446) return 'ANGLE (Intel, Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0, D3D11)';
                return _getPara(param);
            }};
        }}
        return ctx;
    }};

    // ── 5. Canvas fingerprint noise — tiny random pixel perturbation ──────
    const _toDataURL = HTMLCanvasElement.prototype.toDataURL;
    HTMLCanvasElement.prototype.toDataURL = function(type) {{
        const ctx2 = this.getContext('2d');
        if (ctx2) {{
            const img = ctx2.getImageData(0, 0, 1, 1);
            img.data[0] ^= {random.randint(1, 8)};
            ctx2.putImageData(img, 0, 0);
        }}
        return _toDataURL.call(this, type);
    }};

    // ── 6. Audio fingerprint noise ────────────────────────────────────────
    const _createBuffer = AudioBuffer.prototype.getChannelData;
    if (_createBuffer) {{
        AudioBuffer.prototype.getChannelData = function(ch) {{
            const data = _createBuffer.call(this, ch);
            for (let i = 0; i < data.length; i += 1000)
                data[i] += Math.random() * 0.0000001;
            return data;
        }};
    }}

    // ── 7. Screen / window — match viewport ───────────────────────────────
    def(screen, 'width', window.innerWidth || {random.choice([1366,1440,1920])});
    def(screen, 'height', window.innerHeight || {random.choice([768,900,1080])});
    def(screen, 'availWidth', window.innerWidth || {random.choice([1366,1440,1920])});
    def(screen, 'availHeight', (window.innerHeight || {random.choice([768,900,1080])}) - 40);
    def(screen, 'colorDepth', 24);
    def(screen, 'pixelDepth', 24);
    def(window, 'devicePixelRatio', 1);
    def(window, 'outerWidth', window.innerWidth);
    def(window, 'outerHeight', window.innerHeight + {random.randint(85,110)});

    // ── 8. Timing API — real browsers have nonzero connection timings ─────
    if (window.PerformanceTiming) {{
        const _now = Date.now();
        const t = performance.timing;
        ['navigationStart','unloadEventStart','unloadEventEnd',
         'redirectStart','redirectEnd','fetchStart','domainLookupStart',
         'domainLookupEnd','connectStart','connectEnd','requestStart',
         'responseStart','responseEnd','domLoading','domInteractive',
         'domContentLoadedEventStart','domContentLoadedEventEnd',
         'domComplete','loadEventStart','loadEventEnd'].forEach((k,i) => {{
            try {{ Object.defineProperty(t, k, {{ get: () => _now - (19-i)*{random.randint(8,25)} }}); }}
            catch(e) {{}}
        }});
    }}

    // ── 9. Focus / visibility lock ────────────────────────────────────────
    def(document, 'visibilityState', 'visible');
    def(document, 'hidden', false);
    document.hasFocus = () => true;
    document.addEventListener('visibilitychange',
        e => e.stopImmediatePropagation(), true);

    // ── 10. iframe contentWindow.navigator.webdriver ─────────────────────
    const _attach = HTMLIFrameElement.prototype.attachShadow;
    try {{
        Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', {{
            get: function() {{
                const w = Object.getOwnPropertyDescriptor(
                    HTMLIFrameElement.prototype, 'contentWindow'
                ).get.call(this);
                if (w && w.navigator)
                    def(w.navigator, 'webdriver', undefined);
                return w;
            }}
        }});
    }} catch(e) {{}}

    // ── 11. Battery API ──────────────────────────────────────────────────
    if (navigator.getBattery) {{
        navigator.getBattery = () => Promise.resolve({{
            charging: true, chargingTime: 0,
            dischargingTime: Infinity, level: 0.77,
            addEventListener: () => {{}},
        }});
    }}

    // ── 12. Network Info ─────────────────────────────────────────────────
    if (navigator.connection) {{
        def(navigator.connection, 'rtt', 20);
        def(navigator.connection, 'downlink', 25);
        def(navigator.connection, 'effectiveType', '4g');
        def(navigator.connection, 'saveData', false);
    }}

    // ── 13. Media devices ────────────────────────────────────────────────
    if (navigator.mediaDevices && navigator.mediaDevices.enumerateDevices) {{
        const _enumDev = navigator.mediaDevices.enumerateDevices
            .bind(navigator.mediaDevices);
        navigator.mediaDevices.enumerateDevices = () =>
            _enumDev().then(d => d.length ? d : [
                {{ kind:'audioinput', deviceId:'default', label:'', groupId:'' }},
                {{ kind:'audiooutput', deviceId:'default', label:'', groupId:'' }},
            ]);
    }}

    // ── 14. Browser-specific: Yandex / Edge / Brave ──────────────────────
    (function() {{
        const ua = navigator.userAgent || '';
        if (ua.includes('YaBrowser')) {{
            window.yandex = window.yandex || {{}};
            window.Ya = window.Ya || {{}};
            def(navigator, 'vendor', 'Yandex');
        }}
        if (ua.includes('Edg/') && !ua.includes('Edge/')) {{
            window.msWriteProfilerMark =
                window.msWriteProfilerMark || (() => {{}});
        }}
        if (ua.includes('Brave')) {{
            navigator.brave = {{ isBrave: () => Promise.resolve(true) }};
        }}
    }})();

}})();
"""
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Browser resolver — Edge → Yandex → Chromium fallback
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _resolve_browser() -> tuple[str, str]:
    """
    Locate the browser binary to use for scraping.

    Priority:
      1. Read 'browser_choice' from the DB (set via Settings tab).
         Choices: 'auto' | 'chrome' | 'edge' | 'yandex' | 'brave'
      2. If choice is 'auto' (or unset), probe Edge → Yandex → Chromium.
      3. For an explicit choice, search known install paths + PATH.
         Falls back to Playwright Chromium if the chosen browser isn't found.

    Both Edge, Yandex, Brave, and Chrome are Chromium-based, so Playwright
    drives all of them via pw.chromium.launch(executable_path=...).

    Returns (browser_label, executable_path_or_empty_string).
    An empty path tells Playwright to use its own managed Chromium.
    """
    import shutil

    choice = _get_config("browser_choice", "auto").strip().lower()

    # ── Candidate path tables ─────────────────────────────────────────────────
    BROWSERS: dict[str, dict] = {
        "chrome": {
            "label": "Google Chrome",
            "paths": [
                r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/chromium-browser",
            ],
            "which": ["google-chrome", "google-chrome-stable", "chromium-browser", "chrome"],
        },
        "edge": {
            "label": "Microsoft Edge",
            "paths": [
                r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe",
                r"C:\Program Files\Microsoft\Edge\Application\msedge.exe",
                "/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
                "/usr/bin/microsoft-edge",
                "/usr/bin/microsoft-edge-stable",
            ],
            "which": ["msedge", "microsoft-edge", "microsoft-edge-stable"],
        },
        "yandex": {
            "label": "Yandex Browser",
            "paths": [
                rf"C:\Users\{os.environ.get('USERNAME', os.environ.get('USER', ''))}\AppData\Local\Yandex\YandexBrowser\Application\browser.exe",
                r"C:\Program Files\Yandex\YandexBrowser\Application\browser.exe",
                "/Applications/Yandex.app/Contents/MacOS/Yandex",
                "/usr/bin/yandex-browser",
                "/usr/bin/yandex-browser-stable",
            ],
            "which": ["yandex-browser", "yandex-browser-stable"],
        },
        "brave": {
            "label": "Brave Browser",
            "paths": [
                r"C:\Program Files\BraveSoftware\Brave-Browser\Application\brave.exe",
                r"C:\Program Files (x86)\BraveSoftware\Brave-Browser\Application\brave.exe",
                rf"C:\Users\{os.environ.get('USERNAME', os.environ.get('USER', ''))}\AppData\Local\BraveSoftware\Brave-Browser\Application\brave.exe",
                "/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
                "/usr/bin/brave-browser",
                "/usr/bin/brave-browser-stable",
                "/usr/bin/brave",
            ],
            "which": ["brave-browser", "brave-browser-stable", "brave"],
        },
    }

    def _find(key: str) -> tuple[str, str]:
        """Return (label, exe) for a named browser, or ('', '') if not found."""
        entry = BROWSERS.get(key, {})
        # Known install locations first, then anything on PATH.
        located = next((p for p in entry.get("paths", []) if os.path.isfile(p)), "")
        if not located:
            for cmd in entry.get("which", []):
                hit = shutil.which(cmd)
                if hit:
                    located = hit
                    break
        return (entry["label"], located) if located else ("", "")

    # ── Explicit choice ───────────────────────────────────────────────────────
    if choice in BROWSERS:
        label, exe = _find(choice)
        if exe:
            print(f"[Browser] ✅ {label} selected & found → {exe}")
            return (label, exe)
        print(
            f"[Browser] ⚠️ '{choice}' was selected in Settings but is not "
            f"installed on this machine.\n"
            f" Falling back to Playwright's managed Chromium.\n"
            f" Install {BROWSERS[choice]['label']} and restart Ghost Node."
        )
        return ("Playwright Chromium", "")

    # ── Auto-detect: Edge → Yandex → Chrome → Brave → Chromium ──────────────
    print("[Browser] 🔍 Auto-detecting browser (Edge → Yandex → Chrome → Brave)…")
    for key in ("edge", "yandex", "chrome", "brave"):
        label, exe = _find(key)
        if exe:
            print(f"[Browser] ✅ Auto-selected: {label} → {exe}")
            return (label, exe)

    print(
        "[Browser] ⚠️ No supported browser found. Using Playwright Chromium.\n"
        " Install Edge, Yandex, Chrome, or Brave for a real browser experience."
    )
    return ("Playwright Chromium", "")
|
||
|
||
# Browser is resolved fresh each cycle inside nuclear_engine so Settings
|
||
# changes take effect on the next cycle without restarting Ghost Node.
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Telegram helpers
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _get_config(key: str, default: str = "") -> str:
    """Look up a single Config value by key; *default* when absent or empty."""
    session = SessionLocal()
    try:
        row = session.query(Config).filter(Config.key == key).first()
        if row and row.value:
            return row.value
        return default
    finally:
        session.close()
|
||
|
||
|
||
# ── N1 Proxy rotation ──────────────────────────────────────────────────────
|
||
class _RoundRobin:
|
||
"""Thread-safe round-robin counter for proxy rotation."""
|
||
def __init__(self): self._v = 0; self._lock = threading.Lock()
|
||
def get(self) -> int:
|
||
with self._lock: return self._v
|
||
def increment(self) -> None:
|
||
with self._lock: self._v += 1
|
||
|
||
_proxy_counter = _RoundRobin()
|
||
|
||
|
||
def _get_proxy() -> dict | None:
    """
    Returns a Playwright proxy dict if proxy_enabled=true and proxy_list has entries.
    Rotates through the list round-robin using a module-level counter.
    Proxy URLs should be in the format: http://host:port or http://user:pass@host:port
    Returns None if proxy is disabled or list is empty.
    """
    import urllib.parse as _up

    if _get_config("proxy_enabled", "false").lower() != "true":
        return None
    # One proxy URL per line in the stored config value.
    pool = [
        line.strip()
        for line in _get_config("proxy_list", "").strip().splitlines()
        if line.strip()
    ]
    if not pool:
        return None

    # Round-robin rotation using the shared counter.
    chosen = pool[_proxy_counter.get() % len(pool)]
    _proxy_counter.increment()

    # Parse into Playwright's expected shape:
    # {"server": "http://host:port", "username": "...", "password": "..."}
    parsed = _up.urlparse(chosen)
    result: dict = {"server": f"{parsed.scheme}://{parsed.hostname}:{parsed.port}"}
    if parsed.username:
        result["username"] = parsed.username
    if parsed.password:
        result["password"] = parsed.password
    return result
|
||
|
||
|
||
async def send_telegram(message: str) -> bool:
    """
    Pull token + chat_id fresh from the DB on every call so Settings-tab
    changes are immediately active without a restart.

    Error handling tiers:
      HTTP 400 → 'chat not found' means the bot has no open session with
                 the user. Print an ACTION REQUIRED banner with exact steps.
      HTTP 401 → Bad token — print the token prefix for comparison.
      HTTP 4xx → Any other client error — print full Telegram JSON body.
      Timeout / network → log and return False without raising so the
                 scraper continues to the next lot uninterrupted.

    Returns True only on HTTP 200; every failure path returns False.
    """
    token = _get_config("telegram_token")
    chat_id = _get_config("telegram_chat_id")

    if not token or not chat_id:
        print("[Telegram] ⚠️ No token/chat_id in DB — save Settings first.")
        return False

    url = f"https://api.telegram.org/bot{token}/sendMessage"
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            r = await client.post(
                url,
                data={"chat_id": chat_id, "text": message, "parse_mode": "HTML"},
            )
            if r.status_code == 200:
                print(f"[Telegram] ✅ Alert sent to chat {chat_id}")
                return True

            # ── Parse the JSON error body Telegram always returns ────────
            try:
                err_body = r.json()
            except Exception:
                # Body wasn't JSON — wrap the raw text so later lookups work.
                err_body = {"description": r.text}

            description = err_body.get("description", "").lower()

            if r.status_code == 400 and "chat not found" in description:
                # ── Actionable guidance for the most common setup mistake ─────
                print(
                    f"\n[Telegram] ❌ HTTP 400 — chat not found\n"
                    f" Chat ID in DB : {chat_id}\n"
                    f" ──────────────────────────────────────────────\n"
                    f" ACTION REQUIRED — three steps to fix this:\n"
                    f" 1. Open Telegram and find your bot.\n"
                    f" 2. Press START or send /start to it.\n"
                    f" (Bots cannot message you first — you must\n"
                    f" open the conversation from your side.)\n"
                    f" 3. If using a group, the Chat ID must begin\n"
                    f" with a minus sign, e.g. -100123456789.\n"
                    f" ──────────────────────────────────────────────\n"
                    f" Telegram raw: {err_body}\n"
                )
            elif r.status_code == 401:
                print(
                    f"[Telegram] ❌ HTTP 401 Unauthorized\n"
                    f" Token prefix in DB: '{token[:12]}…'\n"
                    f" ACTION REQUIRED: Verify the Bot Token in Settings.\n"
                    f" Telegram raw: {err_body}"
                )
            else:
                # Generic non-200 — print everything for easy debugging
                print(
                    f"[Telegram] ❌ HTTP {r.status_code} — "
                    f"token='{token[:12]}…' chat='{chat_id}'\n"
                    f" Telegram says: {err_body}"
                )
            return False

    except httpx.TimeoutException:
        # ── Never let a Telegram timeout crash the scraper loop ──────────────
        print("[Telegram] ❌ Request timed out — check network. Scraper continues.")
        return False
    except httpx.RequestError as exc:
        print(f"[Telegram] ❌ Network error: {exc}. Scraper continues.")
        return False
    except Exception as exc:
        print(f"[Telegram] ❌ Unexpected error: {type(exc).__name__}: {exc}. Scraper continues.")
        return False
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N10 — Multi-channel alert dispatcher
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _send_discord(message: str) -> bool:
    """Post *message* to the configured Discord webhook; True on success."""
    webhook = _get_config("discord_webhook")
    if not webhook:
        print("[Discord] ⚠️ No webhook URL saved.")
        return False

    # Discord uses 'content' not Telegram HTML — strip basic HTML tags
    content = re.sub(r"<[^>]+>", "", message).strip()
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            # Discord caps message content at 2000 characters.
            resp = await client.post(webhook, json={"content": content[:2000]})
        if resp.status_code in (200, 204):
            print("[Discord] ✅ Alert sent.")
            return True
        print(f"[Discord] ❌ HTTP {resp.status_code}: {resp.text[:200]}")
        return False
    except Exception as exc:
        print(f"[Discord] ❌ {exc}")
        return False
|
||
|
||
|
||
async def _send_email(message: str, subject: str = "Ghost Node Alert") -> bool:
    """Send an alert via Gmail using an App Password.

    Requires: gmail_address + gmail_app_password + email_to in Config.
    Get an App Password at: myaccount.google.com/apppasswords
    (requires 2-Step Verification to be enabled on your Google account).

    The blocking smtplib work runs in the default thread-pool executor so
    the event loop never stalls on network I/O.  Returns True on success,
    False on any failure (missing config, SMTP/auth error, timeout).
    """
    import smtplib
    from email.mime.text import MIMEText

    gmail_addr = _get_config("gmail_address", "").strip()
    app_pass = _get_config("gmail_app_password", "").strip()
    to = _get_config("email_to", "").strip()
    if not all([gmail_addr, app_pass, to]):
        print("[Email] ⚠️ Gmail not configured — set gmail_address, gmail_app_password, email_to in Settings.")
        return False

    # Alerts are authored as Telegram HTML — strip tags for plain email.
    plain = re.sub(r"<[^>]+>", "", message).strip()

    def _do_send() -> None:
        # Runs in a worker thread: build the MIME message and push via
        # STARTTLS on the standard Gmail submission port.
        msg = MIMEText(plain, "plain", "utf-8")
        msg["Subject"] = subject
        msg["From"] = gmail_addr
        msg["To"] = to
        with smtplib.SMTP("smtp.gmail.com", 587, timeout=20) as s:
            s.starttls()
            s.login(gmail_addr, app_pass)
            s.sendmail(gmail_addr, [to], msg.as_string())

    try:
        # BUGFIX: asyncio.get_event_loop() is deprecated inside a running
        # coroutine (Python 3.10+) and can bind the wrong loop in threaded
        # setups like this app's; get_running_loop() is the correct call.
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _do_send)
        print(f"[Email] ✅ Alert sent to {to}")
        return True
    except Exception as exc:
        print(f"[Email] ❌ {exc}")
        return False
|
||
|
||
|
||
async def send_alert(message: str, subject: str = "Ghost Node Alert") -> None:
    """
    Fan an alert out to every channel the user has enabled.

    The enabled set lives in Config as a comma-separated list, e.g.
    "telegram,discord". Channels run concurrently and independently —
    one channel failing never blocks or cancels the others.
    """
    enabled = _get_config("alert_channels", "telegram")
    names = [part.strip().lower() for part in enabled.split(",") if part.strip()]

    # Channel name → coroutine factory (email additionally takes the subject).
    factories = {
        "telegram": lambda: send_telegram(message),
        "discord": lambda: _send_discord(message),
        "email": lambda: _send_email(message, subject),
    }
    pending = [factories[name]() for name in names if name in factories]
    if pending:
        await asyncio.gather(*pending, return_exceptions=True)
async def get_telegram_updates(offset: int) -> list[dict]:
    """Long-poll the Telegram Bot API for updates past *offset*.

    Returns the raw update list, or [] when the token is missing, the
    request fails, or Telegram responds with a non-200 status.
    """
    token = _get_config("telegram_token")
    if not token:
        return []

    endpoint = f"https://api.telegram.org/bot{token}/getUpdates"
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            resp = await client.get(endpoint, params={"offset": offset, "timeout": 10})
            if resp.status_code == 200:
                return resp.json().get("result", [])
            print(f"[Telegram C2] ❌ getUpdates HTTP {resp.status_code}: {resp.text}")
    except Exception as exc:
        print(f"[Telegram C2] ❌ getUpdates error: {exc}")
    return []
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Thread C — Telegram C2 Polling
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def telegram_c2_loop() -> None:
    """
    Thread C main loop: poll Telegram for operator commands and dispatch them.

    Commands: /status, /pause, /resume, /listings, /top5, /sites,
    /alert on|off <kw>, /keywords, /help. Any exception inside one polling
    cycle is printed and the loop continues — the C2 channel never dies.
    """
    offset = 0  # Telegram getUpdates cursor: next update_id to fetch
    print("[Thread C] Telegram C2 online.")
    while True:
        try:
            updates = await get_telegram_updates(offset)
            for upd in updates:
                # Advance the cursor so this update is never re-delivered.
                offset = upd["update_id"] + 1
                msg = upd.get("message", {})
                text = msg.get("text", "").strip()
                chat_id_upd = msg.get("chat", {}).get("id")
                if not chat_id_upd:
                    # Not a chat message (e.g. edited message / channel post) — skip.
                    continue

                if text == "/status":
                    # Health report built from the in-process _stats dict.
                    uptime_secs = int(time.time() - _stats["uptime_start"])
                    h, rem = divmod(uptime_secs, 3600)
                    m, s = divmod(rem, 60)
                    report = (
                        "🕵️ <b>Ghost Node — Health Report</b>\n"
                        f"━━━━━━━━━━━━━━━━━━━━\n"
                        f"🟢 Engine: {_stats['engine_status']}\n"
                        f"📡 Scanned: {_stats['total_scanned']} listings\n"
                        f"🚨 Alerts sent: {_stats['total_alerts']}\n"
                        f"🔄 Last cycle: {_stats['last_cycle']}\n"
                        f"⏱️ Uptime: {h:02d}h {m:02d}m {s:02d}s\n"
                        f"🔀 Proxy: {'ON' if _get_config('proxy_enabled','false').lower()=='true' else 'OFF'}\n"
                        f"🖥️ Host OS: {platform.system()} {platform.release()}"
                    )
                    await send_telegram(report)

                elif text == "/pause":
                    # Thread B reads _stats["engine_status"] to gate scraping.
                    _stats["engine_status"] = "Paused"
                    await send_telegram("⏸️ Engine paused.")

                elif text == "/resume":
                    _stats["engine_status"] = "Running"
                    await send_telegram("▶️ Engine resumed.")

                elif text == "/listings":
                    # Most recent 5 captures, newest first.
                    db = SessionLocal()
                    try:
                        rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(5).all()
                        if rows:
                            lines = "\n".join(
                                f"• {r.title[:40]} — £{r.price or '?'} (score {r.score})"
                                for r in rows
                            )
                            await send_telegram(f"📋 <b>Last 5 Listings:</b>\n{lines}")
                        else:
                            await send_telegram("No listings found yet.")
                    finally:
                        db.close()

                elif text == "/top5":
                    # Highest-scoring 5; ties broken by recency.
                    db = SessionLocal()
                    try:
                        rows = db.query(Listing).order_by(Listing.score.desc(), Listing.timestamp.desc()).limit(5).all()
                        if rows:
                            lines = []
                            for i, r in enumerate(rows, 1):
                                price_str = f"{r.currency or ''}{r.price:.0f}" if r.price else "?"
                                lines.append(f"{i}. {r.title[:45]}\n   💰 {price_str} | ⭐ {r.score} | 🌐 {r.site_name or '?'}")
                            await send_telegram("🏆 <b>Top 5 by Score:</b>\n\n" + "\n\n".join(lines))
                        else:
                            await send_telegram("No listings found yet.")
                    finally:
                        db.close()

                elif text == "/sites":
                    # Per-site enable flag plus failure-streak health marker.
                    db = SessionLocal()
                    try:
                        sites = db.query(TargetSite).all()
                        if sites:
                            lines = []
                            for s in sites:
                                status = "🟢" if s.enabled == 1 else "🔴"
                                health = f"⚠️ {s.consecutive_failures} fails" if (s.consecutive_failures or 0) >= 3 else "✓"
                                lines.append(f"{status} {s.name} — {health}")
                            await send_telegram("🌐 <b>Target Sites:</b>\n" + "\n".join(lines))
                        else:
                            await send_telegram("No sites configured.")
                    finally:
                        db.close()

                elif text.startswith("/alert "):
                    # /alert on <keyword> or /alert off <keyword>
                    # Muting a keyword flips its weight sign negative; "on" restores it.
                    parts = text.split(" ", 2)
                    if len(parts) == 3 and parts[1].lower() in ("on", "off"):
                        action, kw_term = parts[1].lower(), parts[2].strip()
                        db = SessionLocal()
                        try:
                            kw = db.query(Keyword).filter(Keyword.term.ilike(kw_term)).first()
                            if kw:
                                kw.weight = abs(kw.weight) if action == "on" else -abs(kw.weight)
                                db.flush()
                                db.commit()
                                await send_telegram(f"✅ Keyword '<b>{kw.term}</b>' alerts turned <b>{action.upper()}</b>.")
                            else:
                                await send_telegram(f"❌ Keyword '<b>{kw_term}</b>' not found. Use /keywords to list all.")
                        finally:
                            db.close()
                    else:
                        await send_telegram("Usage: /alert on <keyword> or /alert off <keyword>")

                elif text == "/keywords":
                    # Green = active (positive weight), red = muted.
                    db = SessionLocal()
                    try:
                        kws = db.query(Keyword).all()
                        if kws:
                            lines = [f"{'🟢' if (k.weight or 0) > 0 else '🔴'} {k.term} (weight {k.weight})" for k in kws]
                            await send_telegram("🔍 <b>Keywords:</b>\n" + "\n".join(lines))
                        else:
                            await send_telegram("No keywords configured.")
                    finally:
                        db.close()

                elif text == "/help":
                    await send_telegram(
                        "🕵️ <b>Ghost Node Commands:</b>\n"
                        "/status — engine health\n"
                        "/listings — last 5 captures\n"
                        "/top5 — top 5 by score\n"
                        "/sites — site health\n"
                        "/keywords — all keywords\n"
                        "/alert on <kw> — re-enable keyword\n"
                        "/alert off <kw> — mute keyword\n"
                        "/pause — pause engine\n"
                        "/resume — resume engine"
                    )

        except Exception as exc:
            # Never let a single bad update kill the C2 thread.
            print(f"[Thread C] Error: {exc}")

        await asyncio.sleep(3)
def timeLeftToMins(tl: str) -> float:
    """
    Convert a '2d 4h 30m 45s' countdown string to total minutes as a float.

    Seconds are included (as a fraction of a minute) so the countdown is
    accurate to the second. Each unit is optional and matched independently.
    Returns float('inf') if nothing parseable (no time data scraped), so
    unparseable listings sort last in "ending soonest" ordering.
    """
    if not tl or not tl.strip():
        return float("inf")
    # Uses the module-level `re` import; the original shadowed it with a
    # redundant local `import re as _re`.
    total = 0.0
    days = re.search(r"(\d+)\s*d", tl)
    if days:
        total += int(days.group(1)) * 1440
    hours = re.search(r"(\d+)\s*h", tl)
    if hours:
        total += int(hours.group(1)) * 60
    minutes = re.search(r"(\d+)\s*m(?!s)", tl)  # 'm' but not 'ms'
    if minutes:
        total += int(minutes.group(1))
    seconds = re.search(r"(\d+)\s*s(?!\w)", tl)  # bare seconds
    if seconds:
        total += int(seconds.group(1)) / 60.0
    return total if total > 0 else float("inf")
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Thread B — Nuclear Scraper Engine
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Currency symbol / code → ISO code map
# NOTE: scanned in order — longer, more specific prefixes (e.g. "CA$", "US$")
# must appear before the bare "$" so ambiguous symbols resolve correctly.
_CURRENCY_MAP: list[tuple[str, str]] = [
    ("CA$", "CAD"),  # must come before bare "$"
    ("CAD", "CAD"),
    ("US$", "USD"),
    ("USD", "USD"),
    ("$", "USD"),  # default bare $ → USD
    ("£", "GBP"),
    ("GBP", "GBP"),
    ("€", "EUR"),
    ("EUR", "EUR"),
    ("AU$", "AUD"),
    ("AUD", "AUD"),
    ("NZ$", "NZD"),
    ("NZD", "NZD"),
    ("CHF", "CHF"),
    ("SEK", "SEK"),
    ("NOK", "NOK"),
    ("DKK", "DKK"),
    ("JPY", "JPY"),
    ("¥", "JPY"),
    ("CNY", "CNY"),
    ("HKD", "HKD"),
    ("MXN", "MXN"),
    ("BRL", "BRL"),
    ("INR", "INR"),
    ("₹", "INR"),
]
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N16 — AI Filter Engine (Groq + Ollama)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _build_ai_prompt(title: str, ai_target: str) -> str:
|
||
"""
|
||
Build a compact, token-efficient prompt for lot classification.
|
||
We keep it short to maximise Groq free-tier token budget.
|
||
"""
|
||
return (
|
||
"You are an auction lot classifier. Decide if this lot matches what the user wants.\n"
|
||
"Rules:\n"
|
||
" - Only return YES or NO followed by a colon and a very short reason (max 12 words).\n"
|
||
" - Be strict. Accessories, cases, chargers, covers, screen protectors, keyboards, "
|
||
"manuals, boxes, cables, stands, docks, and any non-device items are always NO.\n"
|
||
" - If the lot title is vague or unclear, return NO.\n"
|
||
" - Format exactly: YES: reason OR NO: reason\n\n"
|
||
f"User wants: {ai_target}\n"
|
||
f"Lot title: {title}\n"
|
||
"Answer:"
|
||
)
|
||
|
||
|
||
def _ai_debug_enabled() -> bool:
    """Returns True if ai_debug config key is set to 'true'."""
    flag = _get_config("ai_debug", "false")
    return flag.strip().lower() == "true"
def _ai_debug_print(tag: str, label: str, content: str, max_chars: int = 2000) -> None:
|
||
"""Print a clearly-framed AI debug block to console."""
|
||
sep = "─" * 60
|
||
print(f"\n[{tag}] ┌{sep}")
|
||
print(f"[{tag}] │ {label}")
|
||
print(f"[{tag}] ├{sep}")
|
||
for line in content[:max_chars].splitlines():
|
||
print(f"[{tag}] │ {line}")
|
||
if len(content) > max_chars:
|
||
print(f"[{tag}] │ ... (truncated — {len(content)} chars total)")
|
||
print(f"[{tag}] └{sep}\n")
|
||
|
||
|
||
async def _ai_call_groq(prompt: str, api_key: str, model: str, _ctx: dict | None = None) -> tuple[bool, str]:
    """Call Groq's OpenAI-compatible free API. Returns (match, reason).

    Fail-open policy: on missing key, rate limit, HTTP error, or exception
    the lot is passed through (match=True) so listings are never dropped
    by AI outages.
    _ctx: optional metadata dict injected into the debug log entry (e.g. title, call_type).
    """
    if not api_key:
        print("[AI] ⚠️ Groq API key not set — skipping AI filter.")
        return True, "no-key"
    _debug = _ai_debug_enabled()
    _resolved_model = model or "llama-3.3-70b-versatile"
    if _debug:
        _ai_debug_print("AI-DEBUG", f"GROQ REQUEST model={_resolved_model}", prompt)
        _ai_log_entry({
            "call_type": (_ctx or {}).get("call_type", "filter"),
            "direction": "request",
            "provider": "groq",
            "model": _resolved_model,
            "content": prompt,
            "title": (_ctx or {}).get("title"),
            "site": (_ctx or {}).get("site"),
        })
    try:
        async with httpx.AsyncClient(timeout=20) as client:
            r = await client.post(
                "https://api.groq.com/openai/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": _resolved_model,
                    "messages": [{"role": "user", "content": prompt}],
                    # Verdict + short reason only — keeps free-tier usage tiny.
                    "max_tokens": 40,
                    "temperature": 0.0,
                },
            )
            if r.status_code == 200:
                rj = r.json()
                raw_text = rj["choices"][0]["message"]["content"].strip()
                usage = rj.get("usage", {})
                tok_p = usage.get("prompt_tokens")
                tok_c = usage.get("completion_tokens")
                # Compute the verdict once and reuse it for both the debug log
                # and the return value (original computed it twice).
                match = raw_text.upper().startswith("YES")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"GROQ RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_text)
                    _ai_log_entry({
                        "call_type": (_ctx or {}).get("call_type", "filter"),
                        "direction": "response",
                        "provider": "groq",
                        "model": _resolved_model,
                        "content": raw_text,
                        "tokens_prompt": tok_p,
                        "tokens_completion": tok_c,
                        "verdict": "YES" if match else "NO",
                        "title": (_ctx or {}).get("title"),
                        "site": (_ctx or {}).get("site"),
                    })
                # Expected format "YES: reason" / "NO: reason"; fall back to raw text.
                reason = raw_text.split(":", 1)[1].strip() if ":" in raw_text else raw_text[:80]
                return match, reason[:200]
            elif r.status_code == 429:
                print("[AI] ⚠️ Groq rate limit hit — passing lot through.")
                if _debug:
                    _ai_debug_print("AI-DEBUG", "GROQ RATE LIMIT RESPONSE (429)", r.text)
                    _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
                                   "model": _resolved_model, "content": "429 Rate limit hit", "status_code": 429})
                return True, "rate-limit"
            else:
                print(f"[AI] ❌ Groq HTTP {r.status_code}: {r.text[:200]}")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"GROQ ERROR RESPONSE ({r.status_code})", r.text)
                    _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
                                   "model": _resolved_model, "content": r.text[:500], "status_code": r.status_code})
                return True, f"api-error-{r.status_code}"
    except Exception as exc:
        print(f"[AI] ❌ Groq call failed: {exc}")
        if _debug:
            _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "groq",
                           "model": _resolved_model, "content": str(exc)})
        return True, "exception"
async def _ai_call_ollama(prompt: str, model: str, base_url: str, _ctx: dict | None = None) -> tuple[bool, str]:
    """Call a local Ollama instance. Returns (match, reason).

    Fail-open policy: on HTTP error, unreachable server, or exception the
    lot is passed through (match=True) so listings are never dropped by a
    local AI outage.
    _ctx: optional metadata dict injected into the debug log entry.
    """
    url = f"{base_url.rstrip('/')}/api/generate"
    _debug = _ai_debug_enabled()
    _resolved_model = model or "llama3.2:3b"
    if _debug:
        _ai_debug_print("AI-DEBUG", f"OLLAMA REQUEST url={url} model={_resolved_model}", prompt)
        _ai_log_entry({
            "call_type": (_ctx or {}).get("call_type", "filter"),
            "direction": "request",
            "provider": "ollama",
            "model": _resolved_model,
            "content": prompt,
            "title": (_ctx or {}).get("title"),
            "site": (_ctx or {}).get("site"),
        })
    try:
        async with httpx.AsyncClient(timeout=60) as client:
            r = await client.post(
                url,
                json={"model": _resolved_model, "prompt": prompt, "stream": False},
            )
            if r.status_code == 200:
                rj = r.json()
                raw_text = rj.get("response", "").strip()
                # Ollama eval_count ≈ completion tokens; prompt_eval_count ≈ prompt tokens
                tok_p = rj.get("prompt_eval_count")
                tok_c = rj.get("eval_count")
                # Compute the verdict once and reuse it for both the debug log
                # and the return value (original computed it twice).
                match = raw_text.upper().startswith("YES")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"OLLAMA RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_text)
                    _ai_log_entry({
                        "call_type": (_ctx or {}).get("call_type", "filter"),
                        "direction": "response",
                        "provider": "ollama",
                        "model": _resolved_model,
                        "content": raw_text,
                        "tokens_prompt": tok_p,
                        "tokens_completion": tok_c,
                        "verdict": "YES" if match else "NO",
                        "title": (_ctx or {}).get("title"),
                        "site": (_ctx or {}).get("site"),
                    })
                # Expected format "YES: reason" / "NO: reason"; fall back to raw text.
                reason = raw_text.split(":", 1)[1].strip() if ":" in raw_text else raw_text[:80]
                return match, reason[:200]
            else:
                print(f"[AI] ❌ Ollama HTTP {r.status_code}: {r.text[:200]}")
                if _debug:
                    _ai_debug_print("AI-DEBUG", f"OLLAMA ERROR RESPONSE ({r.status_code})", r.text)
                    _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                                   "model": _resolved_model, "content": r.text[:500], "status_code": r.status_code})
                return True, f"ollama-error-{r.status_code}"
    except httpx.ConnectError:
        print(f"[AI] ❌ Ollama not reachable at {base_url} — is it running? Passing lot through.")
        if _debug:
            _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                           "model": _resolved_model, "content": f"Connection refused at {base_url}"})
        return True, "ollama-offline"
    except Exception as exc:
        print(f"[AI] ❌ Ollama call failed: {exc}")
        if _debug:
            _ai_log_entry({"call_type": "filter", "direction": "error", "provider": "ollama",
                           "model": _resolved_model, "content": str(exc)})
        return True, "exception"
async def _ai_analyze(title: str, ai_target: str) -> tuple[bool, str]:
    """
    Main AI dispatch. Reads provider from config each call so Settings
    changes take effect without restart.

    Returns (match: bool, reason: str).
    If AI is misconfigured or errors, defaults to True (pass through)
    so the scraper never silently drops listings.
    """
    provider = _get_config("ai_provider", "groq").strip().lower()
    if provider == "none":
        return True, ""

    prompt = _build_ai_prompt(title, ai_target)
    model = _get_config("ai_model", "").strip()
    _ctx = {"call_type": "filter", "title": title}

    if provider == "groq":
        api_key = _get_config("ai_api_key", "").strip()
        match, reason = await _ai_call_groq(
            prompt, api_key, model or "llama-3.3-70b-versatile", _ctx=_ctx
        )
    elif provider == "ollama":
        base_url = _get_config("ai_base_url", "http://localhost:11434").strip()
        match, reason = await _ai_call_ollama(
            prompt, model or "llama3.2:3b", base_url, _ctx=_ctx
        )
    else:
        # Unknown provider string — behave like "none": pass through silently.
        return True, ""

    print(f"[AI] {'✅ YES' if match else '❌ NO'} — {title[:60]} → {reason}")
    return match, reason
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N17 — Auto-Adapter: AI-powered CSS selector generator
|
||
# Supports: Groq (free cloud) + Ollama (local unlimited)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _clean_html_for_ai(raw_html: str, max_chars: int = 14000) -> str:
|
||
"""
|
||
Strip everything that wastes tokens and confuses the AI:
|
||
- <script> and <style> blocks (content removed entirely)
|
||
- SVG blobs
|
||
- HTML comments
|
||
- Inline event handlers (on*)
|
||
- data: URIs (they are enormous)
|
||
- Excessive whitespace
|
||
Keeps class names, id attributes, aria-label, data-* attributes
|
||
because those are what CSS selectors are built from.
|
||
Then tries to isolate the main content area before truncating.
|
||
"""
|
||
# Remove block-level noise
|
||
raw_html = re.sub(r'<script[^>]*>.*?</script>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)
|
||
raw_html = re.sub(r'<style[^>]*>.*?</style>', '', raw_html, flags=re.DOTALL | re.IGNORECASE)
|
||
raw_html = re.sub(r'<svg[^>]*>.*?</svg>', '<svg/>', raw_html, flags=re.DOTALL | re.IGNORECASE)
|
||
raw_html = re.sub(r'<!--.*?-->', '', raw_html, flags=re.DOTALL)
|
||
# Remove event handlers + data URIs
|
||
raw_html = re.sub(r'\s+on\w+="[^"]*"', '', raw_html)
|
||
raw_html = re.sub(r'\s+on\w+=\'[^\']*\'', '', raw_html)
|
||
raw_html = re.sub(r'(src|href|style)="data:[^"]*"', r'\1="data:..."', raw_html)
|
||
# Collapse whitespace
|
||
raw_html = re.sub(r'\s{2,}', ' ', raw_html).strip()
|
||
|
||
# Try to isolate the main content region (listings are usually here)
|
||
for main_pat in [
|
||
r'<main[^>]*>(.*?)</main>',
|
||
r'<\w+[^>]+role=["\']main["\'][^>]*>(.*?)</\w+>',
|
||
r'<\w+[^>]+id=["\'](?:content|main|results|listings|items|products)["\'][^>]*>(.*?)(?=<(?:footer|aside))',
|
||
r'<\w+[^>]+class=["\'][^"\']*(?:results|listings|items|products|catalog)[^"\']*["\'][^>]*>(.*?)(?=<(?:footer|aside))',
|
||
]:
|
||
m = re.search(main_pat, raw_html, re.DOTALL | re.IGNORECASE)
|
||
if m and len(m.group(1)) > 500:
|
||
return m.group(1)[:max_chars]
|
||
|
||
return raw_html[:max_chars]
|
||
|
||
|
||
def _build_selector_prompt(cleaned_html: str, site_name: str) -> str:
|
||
return f"""You are an expert web scraping engineer analyzing an auction/bidding website called "{site_name}".
|
||
|
||
Study the HTML below and find the CSS selectors for extracting auction listings.
|
||
|
||
RULES:
|
||
1. "container" = the CSS selector that matches EACH repeated listing card/item (must return 3+ elements)
|
||
2. "title_sel", "price_sel", "time_sel", "link_sel" = selectors RELATIVE to container (use querySelector inside the container)
|
||
3. For link_sel: if the container element itself is an <a> tag, write "self". Otherwise the CSS selector for the <a> inside the container.
|
||
4. For next_page_sel: the button or link to go to the next page. null if not found or not applicable.
|
||
5. If you cannot find a reliable selector for a field, use null.
|
||
6. Use the most specific and stable selector possible (prefer id > data-* attributes > class names).
|
||
7. Return ONLY a valid JSON object. No explanation, no markdown, no code blocks. Just the JSON.
|
||
|
||
REQUIRED JSON FORMAT:
|
||
{{"container": "...", "title_sel": "...", "price_sel": "...", "time_sel": "...", "link_sel": "...", "next_page_sel": "..."}}
|
||
|
||
HTML:
|
||
{cleaned_html}"""
|
||
|
||
|
||
async def _generate_selectors_ai(cleaned_html: str, site_name: str) -> dict | None:
    """
    Send cleaned HTML to the configured AI provider and parse the returned
    JSON selector map. Returns a dict with keys matching SiteSelectors fields,
    or None on failure.
    """
    provider = _get_config("ai_provider", "groq").strip().lower()
    model = _get_config("ai_model", "").strip()
    prompt = _build_selector_prompt(cleaned_html, site_name)

    _debug = _ai_debug_enabled()
    if _debug:
        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT PROMPT site={site_name} provider={provider} model={model} html_chars={len(cleaned_html)}", prompt, max_chars=3000)
        _ai_log_entry({
            "call_type": "adapt",
            "direction": "request",
            "provider": provider,
            "model": model,
            "content": prompt,
            "site": site_name,
        })

    raw_response = ""
    if provider == "groq":
        api_key = _get_config("ai_api_key", "").strip()
        if not model: model = "llama-3.3-70b-versatile"
        try:
            # Larger timeout + token budget than the filter calls: selector
            # generation returns a whole JSON object, not a one-liner.
            async with httpx.AsyncClient(timeout=40) as client:
                r = await client.post(
                    "https://api.groq.com/openai/v1/chat/completions",
                    headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
                    json={"model": model, "messages": [{"role": "user", "content": prompt}],
                          "max_tokens": 500, "temperature": 0.0},
                )
                if r.status_code == 200:
                    rj = r.json()
                    raw_response = rj["choices"][0]["message"]["content"].strip()
                    usage = rj.get("usage", {})
                    tok_p = usage.get("prompt_tokens")
                    tok_c = usage.get("completion_tokens")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT GROQ RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_response)
                        _ai_log_entry({
                            "call_type": "adapt",
                            "direction": "response",
                            "provider": "groq",
                            "model": model,
                            "content": raw_response,
                            "tokens_prompt": tok_p,
                            "tokens_completion": tok_c,
                            "site": site_name,
                        })
                else:
                    print(f"[AutoAdapt] ❌ Groq HTTP {r.status_code}: {r.text[:200]}")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT GROQ ERROR ({r.status_code})", r.text)
                        _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "groq",
                                       "model": model, "content": r.text[:500], "status_code": r.status_code, "site": site_name})
                    return None
        except Exception as exc:
            print(f"[AutoAdapt] ❌ Groq call failed: {exc}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "groq",
                               "model": model, "content": str(exc), "site": site_name})
            return None

    elif provider == "ollama":
        base_url = _get_config("ai_base_url", "http://localhost:11434").strip()
        if not model: model = "llama3.2:3b"
        try:
            # Local models are slow on long prompts — allow up to 2 minutes.
            async with httpx.AsyncClient(timeout=120) as client:
                r = await client.post(
                    f"{base_url.rstrip('/')}/api/generate",
                    json={"model": model, "prompt": prompt, "stream": False},
                )
                if r.status_code == 200:
                    rj = r.json()
                    raw_response = rj.get("response", "").strip()
                    # Ollama: prompt_eval_count ≈ prompt tokens, eval_count ≈ completion tokens.
                    tok_p = rj.get("prompt_eval_count")
                    tok_c = rj.get("eval_count")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT OLLAMA RESPONSE (tokens: prompt={tok_p} completion={tok_c})", raw_response)
                        _ai_log_entry({
                            "call_type": "adapt",
                            "direction": "response",
                            "provider": "ollama",
                            "model": model,
                            "content": raw_response,
                            "tokens_prompt": tok_p,
                            "tokens_completion": tok_c,
                            "site": site_name,
                        })
                else:
                    print(f"[AutoAdapt] ❌ Ollama HTTP {r.status_code}")
                    if _debug:
                        _ai_debug_print("AI-DEBUG", f"AUTO-ADAPT OLLAMA ERROR ({r.status_code})", r.text)
                        _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                                       "model": model, "content": r.text[:500], "status_code": r.status_code, "site": site_name})
                    return None
        except httpx.ConnectError:
            print(f"[AutoAdapt] ❌ Ollama not reachable at {base_url}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                               "model": model, "content": f"Connection refused at {base_url}", "site": site_name})
            return None
        except Exception as exc:
            print(f"[AutoAdapt] ❌ Ollama call failed: {exc}")
            if _debug:
                _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": "ollama",
                               "model": model, "content": str(exc), "site": site_name})
            return None
    else:
        print("[AutoAdapt] ⚠️ No AI provider configured.")
        return None

    # Extract JSON from response — multi-strategy, most-to-least strict
    def _extract_json(text: str):
        # Strategy 1: direct parse (AI returned pure JSON)
        try:
            obj = json.loads(text)
            if isinstance(obj, dict) and "container" in obj:
                return obj
        except Exception:
            pass
        # Strategy 2: strip markdown code fence (```json ... ``` or ``` ... ```)
        stripped = re.sub(r'^```(?:json)?\s*', '', text.strip(), flags=re.IGNORECASE)
        stripped = re.sub(r'\s*```$', '', stripped.strip())
        try:
            obj = json.loads(stripped)
            if isinstance(obj, dict) and "container" in obj:
                return obj
        except Exception:
            pass
        # Strategy 3: find the outermost {...} block that contains "container"
        brace_start = text.find('{')
        if brace_start != -1:
            depth, end = 0, -1
            # Manual brace matching — tolerates prose before/after the JSON.
            for i, ch in enumerate(text[brace_start:], brace_start):
                if ch == '{': depth += 1
                elif ch == '}':
                    depth -= 1
                    if depth == 0:
                        end = i + 1
                        break
            if end != -1:
                try:
                    obj = json.loads(text[brace_start:end])
                    if isinstance(obj, dict) and "container" in obj:
                        return obj
                except Exception:
                    pass
        # Strategy 4: regex fallback — any {...} block mentioning "container"
        m = re.search(r'\{[^{}]{0,2000}"container"[^{}]{0,2000}\}', text, re.DOTALL)
        if m:
            try:
                return json.loads(m.group())
            except Exception:
                pass
        return None

    data = _extract_json(raw_response)
    if data is None:
        print(f"[AutoAdapt] ❌ No JSON found in AI response: {raw_response[:300]}")
        if _debug:
            _ai_log_entry({"call_type": "adapt", "direction": "error", "provider": provider,
                           "model": model, "content": f"No JSON found in: {raw_response[:400]}", "site": site_name})
        return None
    print(f"[AutoAdapt] ✅ AI returned selectors: {data}")
    return data
async def _validate_selectors(page, sel_dict: dict) -> tuple[float, int, float, float, str]:
    """
    Test generated selectors live on the current page.

    Samples up to 10 matched containers and measures how often the title /
    price / link sub-selectors yield usable data, then combines those hit
    rates into a weighted confidence score.
    Returns (confidence 0-100, container_count, title_rate, price_rate, notes).
    """
    container_sel = (sel_dict.get("container") or "").strip()
    title_sel = (sel_dict.get("title_sel") or "").strip()
    price_sel = (sel_dict.get("price_sel") or "").strip()
    link_sel = (sel_dict.get("link_sel") or "").strip()

    if not container_sel:
        return 0.0, 0, 0.0, 0.0, "No container selector returned by AI"

    try:
        containers = await page.query_selector_all(container_sel)
    except Exception as exc:
        return 0.0, 0, 0.0, 0.0, f"Container selector error: {exc}"

    count = len(containers)
    if count < 2:
        # A listing grid should repeat — one match means the selector is wrong.
        return 5.0, count, 0.0, 0.0, f"Container '{container_sel}' matched only {count} elements"

    # Sample up to 10 containers for field testing
    sample = containers[:min(10, count)]
    title_hits = price_hits = link_hits = 0

    for el in sample:
        try:
            if title_sel:
                t_el = await el.query_selector(title_sel)
                if t_el:
                    txt = (await t_el.inner_text()).strip()
                    if len(txt) > 3:  # non-trivial text counts as a title
                        title_hits += 1
            if price_sel:
                p_el = await el.query_selector(price_sel)
                if p_el:
                    txt = (await p_el.inner_text()).strip()
                    if any(c.isdigit() for c in txt):  # a price needs a digit
                        price_hits += 1
            if link_sel:
                if link_sel == "self":
                    # Container itself is the <a>; an href proves it.
                    # (The original also fetched tagName here but never used it.)
                    href = await el.evaluate("e => e.href || e.getAttribute('href') || ''")
                    if href:
                        link_hits += 1
                else:
                    l_el = await el.query_selector(link_sel)
                    href = await l_el.evaluate("e => e.href || e.getAttribute('href') || ''") if l_el else ""
                    if href:
                        link_hits += 1
        except Exception:
            # One broken sample must not abort validation of the rest.
            continue

    n = len(sample)
    title_rate = round(title_hits / n * 100, 1) if n else 0
    price_rate = round(price_hits / n * 100, 1) if n else 0
    link_rate = round(link_hits / n * 100, 1) if n else 0

    # Confidence formula: weighted score
    # Container count matters most, then title, link, price
    count_score = min(40, (count / 20) * 40)  # up to 40 pts for 20+ containers
    title_score = (title_rate / 100) * 30     # up to 30 pts
    link_score = (link_rate / 100) * 20       # up to 20 pts
    price_score = (price_rate / 100) * 10     # up to 10 pts
    confidence = round(count_score + title_score + link_score + price_score, 1)

    notes = (
        f"Container: {count} items | Title: {title_rate}% | "
        f"Price: {price_rate}% | Link: {link_rate}%"
    )
    print(f"[AutoAdapt] 📊 Validation — {notes} → confidence {confidence}")
    return confidence, count, title_rate, price_rate, notes
async def _extract_with_selectors(page, ss: "SiteSelectors") -> list[dict]:
    """
    Use stored AI selectors to extract listing data from the current page.
    Returns a list of row dicts compatible with the existing scrape pipeline.
    Falls back to empty list on any error (caller then uses JS_EXTRACT fallback).
    """
    try:
        containers = await page.query_selector_all(ss.container_sel)
        if not containers:
            return []
        rows = []
        # Cap at 60 containers to bound per-page extraction time.
        for el in containers[:60]:
            try:
                # Title — prefer the dedicated selector; fall back to the
                # container's own (truncated) text when it yields nothing.
                title = ""
                if ss.title_sel:
                    t_el = await el.query_selector(ss.title_sel)
                    if t_el:
                        title = (await t_el.inner_text()).strip()
                if not title:
                    title = (await el.inner_text()).strip()[:200]

                # Price
                price_text = ""
                if ss.price_sel:
                    p_el = await el.query_selector(ss.price_sel)
                    if p_el:
                        price_text = (await p_el.inner_text()).strip()

                # Time
                time_text = ""
                if ss.time_sel:
                    tm_el = await el.query_selector(ss.time_sel)
                    if tm_el:
                        time_text = (await tm_el.inner_text()).strip()

                # Link — "self" means the container element itself is the <a>.
                href = ""
                if ss.link_sel == "self":
                    href = await el.evaluate("e => e.href || e.getAttribute('href') || ''")
                elif ss.link_sel:
                    l_el = await el.query_selector(ss.link_sel)
                    if l_el:
                        href = await l_el.evaluate("e => e.href || e.getAttribute('href') || ''")
                if not href:
                    # Last resort — find any <a> in container
                    l_el = await el.query_selector("a[href]")
                    if l_el:
                        href = await l_el.evaluate("e => e.href || ''")

                # Discard junk rows with no meaningful title.
                if title and len(title) >= 5:
                    rows.append({"title": title, "price_text": price_text,
                                 "time_text": time_text, "href": href})
            except Exception:
                # One bad container must not abort the rest of the page.
                continue
        return rows
    except Exception as exc:
        print(f"[AutoAdapt] ⚠️ _extract_with_selectors failed: {exc}")
        return []
||
async def _auto_dismiss_popups(page) -> bool:
    """
    Best-effort dismissal of cookie-consent, GDPR, age-gate and ToS overlays
    so the underlying page content is visible before HTML extraction.

    Known consent-framework selectors are probed first (fast + precise);
    after that, any visible button whose accessible name matches a common
    consent phrase is tried. Silent on failure — never raises.
    Returns True if something was clicked.
    """

    async def _try_click(locator, visible_timeout: int) -> bool:
        # Click the locator if it is visible; swallow every Playwright error.
        try:
            if await locator.is_visible(timeout=visible_timeout):
                await locator.click(timeout=1000)
                await asyncio.sleep(0.6)  # let the overlay animate away
                return True
        except Exception:
            pass
        return False

    # ── Known consent-framework CSS selectors ────────────────────────────────
    KNOWN_SELS = [
        "#onetrust-accept-btn-handler",            # OneTrust (very widespread)
        "#CybotCookiebotDialogBodyButtonAccept",   # Cookiebot
        "#cookieConsentAcceptButton",
        "#cookie-notice-accept-button",
        ".cookie-accept-button",
        ".js-cookie-accept",
        ".cc-btn.cc-allow",                        # Cookie Consent (osano)
        "[data-action='accept-cookies']",
        "[data-cookiebanner='accept_button']",
        "[aria-label*='accept' i]",
        "[aria-label*='agree' i]",
        "button#accept-all",
        "button.accept-all",
        "#gdpr-cookie-accept",
        ".gdpr-accept-btn",
        "#age-gate-submit",                        # age gates
        "button[data-testid='cookie-policy-dialog-accept-button']",
    ]
    for sel in KNOWN_SELS:
        try:
            if await _try_click(page.locator(sel).first, 400):
                return True
        except Exception:
            continue

    # ── Text-based fallback for any visible button ───────────────────────────
    ACCEPT_TEXTS = [
        "Accept all", "Accept All", "Accept Cookies", "Accept cookies",
        "I Accept", "I accept", "I Agree", "I agree", "Agree",
        "Accept", "Allow all", "Allow All", "Allow", "Got it",
        "OK", "Ok", "Confirm", "Continue", "I understand",
        "Dismiss", "Close", "I am 18+", "Enter site",
    ]
    for text in ACCEPT_TEXTS:
        try:
            btn = page.get_by_role("button", name=text, exact=False).first
            if await _try_click(btn, 300):
                return True
        except Exception:
            continue

    return False
async def adapt_site_now(site_id: int) -> dict:
    """
    Full AI adaptation pipeline for one site.

    Launches a temporary browser (reusing saved login profile if one exists
    so that Cloudflare/session-gated sites work), auto-dismisses cookie/terms
    popups, navigates using a test keyword, extracts + cleans HTML, calls AI
    for selectors, validates them, stores result in SiteSelectors table.

    Returns a status dict with confidence score and notes, or {"error": ...}
    on any failure (site missing, AI disabled, CF block, browser crash).
    Called both from the API endpoint and from the self-healer.
    Works with both Groq (online, best quality) and Ollama (local, unlimited).
    """
    # ── Read everything we need from the DB up front, then release the session
    #    so no ORM objects are touched after close().
    db = SessionLocal()
    try:
        site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
        if not site:
            return {"error": "Site not found"}
        # Any non-empty keyword works as a probe term; "laptop" as last resort.
        kw_row = db.query(Keyword).filter(Keyword.term != "").first()
        test_term = kw_row.term if kw_row else "laptop"
        site_name = site.name
        url_template = site.url_template
        search_sel = site.search_selector or ""
    finally:
        db.close()

    # Mode A: URL template embeds {keyword}. Mode B: visit homepage and type
    # into the search box (see _discover_search_input below).
    is_direct = "{keyword}" in url_template
    visit_url = url_template.replace("{keyword}", test_term.replace(" ", "+")) if is_direct else url_template
    print(f"[AutoAdapt] 🚀 Starting adaptation for '{site_name}' → {visit_url}")

    provider = _get_config("ai_provider", "groq").strip().lower()
    if provider == "none":
        return {"error": "AI provider is set to none — configure Groq or Ollama in Settings first"}

    # Check for a saved browser profile (created by the 🔑 Login button).
    # If one exists we use launch_persistent_context() so the saved Cloudflare
    # cookies / login session carry over automatically — no manual steps needed.
    _site_slug = re.sub(r"[^\w]", "_", site_name.lower())[:20]
    _profile_dir = os.path.join(os.path.dirname(__file__), ".browser_profiles", _site_slug)
    _use_profile = os.path.isdir(_profile_dir) and bool(os.listdir(_profile_dir))
    if _use_profile:
        print(f"[AutoAdapt] 🔐 Saved browser profile found — using session cookies for '{site_name}'")
    else:
        print(f"[AutoAdapt] 🌐 No saved profile — launching fresh browser for '{site_name}'")

    _LAUNCH_ARGS = [
        "--no-sandbox", "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-background-timer-throttling",
        "--disable-renderer-backgrounding",
    ]

    # ── CF check JS (reused in both headless and headful passes) ──────────────
    _CF_JS = """() => !!(
        document.querySelector('#challenge-stage') ||
        document.querySelector('#cf-please-wait') ||
        document.querySelector('.cf-browser-verification') ||
        document.title.includes('Just a moment') ||
        document.title.includes('Checking your browser')
    )"""

    # Accumulators shared across the two-pass loop below.
    sel_dict = None
    confidence = count = 0
    title_rate = price_rate = 0.0
    notes = ""
    _adapt_error = None

    try:
        async with async_playwright() as pw:
            browser_label, exe = _resolve_browser()
            profile = random.choice(_agent_profiles)

            # Two-pass loop: headless first (fast, invisible), then headful if CF fires.
            # CF is dramatically more lenient with visible browsers; the stealth patches
            # plus a real on-screen window passes the check in the vast majority of cases.
            # If headful ALSO gets CF and a CAPTCHA solver is configured, Turnstile is
            # solved automatically via API — zero manual steps required.
            for _headless in [True, False]:
                if not _headless:
                    print(f"[AutoAdapt] 🖥️ Retrying '{site_name}' with visible browser "
                          f"(window will appear briefly then close automatically)...")
                browser = None
                ctx = None

                # ── Launch browser ─────────────────────────────────────────────
                if _use_profile:
                    # Persistent context: cookies/session live in _profile_dir.
                    _pctx_kw: dict = {
                        "headless": _headless,
                        "args": _LAUNCH_ARGS,
                        "user_agent": profile["ua"],
                        "viewport": {"width": profile["viewport"][0], "height": profile["viewport"][1]},
                        "locale": profile["locale"],
                    }
                    if exe: _pctx_kw["executable_path"] = exe
                    ctx = await pw.chromium.launch_persistent_context(_profile_dir, **_pctx_kw)
                else:
                    _lk: dict = {"headless": _headless, "args": _LAUNCH_ARGS}
                    if exe: _lk["executable_path"] = exe
                    browser = await pw.chromium.launch(**_lk)
                    ctx = await browser.new_context(
                        user_agent=profile["ua"],
                        viewport={"width": profile["viewport"][0], "height": profile["viewport"][1]},
                        locale=profile["locale"],
                    )

                await ctx.add_init_script(_build_stealth_script(profile))
                page = await ctx.new_page()
                # Skip heavy assets — we only need the DOM for selector work.
                await page.route("**/*.{png,jpg,jpeg,gif,webp,woff,woff2,ttf,svg,ico}", lambda r: r.abort())

                try:
                    # ── Navigate ───────────────────────────────────────────────
                    await page.goto(visit_url, timeout=60_000, wait_until="domcontentloaded")

                    # Auto-dismiss cookie/terms popups before search box interaction
                    if await _auto_dismiss_popups(page):
                        print(f"[AutoAdapt] 🍪 Popup auto-dismissed for '{site_name}'")

                    # ── Cloudflare detection ───────────────────────────────────
                    _on_cf = await page.evaluate(_CF_JS)

                    if _on_cf:
                        if _headless:
                            # Headless blocked — retry non-headless automatically
                            print(f"[AutoAdapt] 🚧 CF detected (headless) — "
                                  f"retrying with visible browser...")
                            continue  # finally closes browser; next iter uses headless=False

                        # Headful also blocked — try Turnstile CAPTCHA solver
                        _solver = _get_config("captcha_solver", "none").lower()
                        _api_key = _get_config("captcha_api_key", "").strip()
                        if _solver in ("2captcha", "capsolver") and _api_key:
                            print(f"[AutoAdapt] 🔐 CF blocked visible browser — "
                                  f"trying Turnstile solver ({_solver})...")
                            _ts_ok = await _solve_cf_turnstile(page, _solver, _api_key)
                            if _ts_ok:
                                # Wait for CF to redirect to the actual page
                                await page.wait_for_load_state("domcontentloaded", timeout=20_000)
                                await asyncio.sleep(1)
                                _on_cf = False  # fall through to extraction below
                            else:
                                _adapt_error = (
                                    "Cloudflare Turnstile could not be solved automatically. "
                                    "Check CAPTCHA solver config and API key balance in Settings."
                                )
                                break
                        else:
                            _adapt_error = (
                                "Cloudflare blocked adapt in both headless and visible-browser modes. "
                                "For fully automatic solving: configure a CAPTCHA solver (2captcha or "
                                "CapSolver) in Settings → CAPTCHA. "
                                "Or: enable 'Requires Login' for this site and click 🔑 Login once "
                                "to save a session manually."
                            )
                            break

                    if not _on_cf:
                        # ── Mode B search ──────────────────────────────────────
                        if not is_direct:
                            try:
                                search_el = await _discover_search_input(page, search_sel, site_name)
                                await search_el.fill(test_term)
                                await search_el.press("Enter")
                                await page.wait_for_load_state("domcontentloaded", timeout=30_000)
                                await asyncio.sleep(2)
                                await _auto_dismiss_popups(page)
                            except Exception as se:
                                # Non-fatal: adapt from whatever page we have.
                                print(f"[AutoAdapt] ⚠️ Search interaction failed: {se} — using homepage HTML")

                        await asyncio.sleep(1.5)  # let lazy-loaded content settle

                        raw_html = await page.content()
                        cleaned = _clean_html_for_ai(raw_html)
                        print(f"[AutoAdapt] 📄 Cleaned HTML: {len(cleaned)} chars → sending to {provider}")

                        sel_dict = await _generate_selectors_ai(cleaned, site_name)
                        if not sel_dict:
                            _adapt_error = "AI did not return usable selectors. Check provider config."
                            break

                        confidence, count, title_rate, price_rate, notes = \
                            await _validate_selectors(page, sel_dict)
                        break  # ✅ success

                finally:
                    # Persistent contexts have no separate Browser object, so
                    # close whichever handle this pass actually created.
                    if browser:
                        await browser.close()
                    elif ctx:
                        await ctx.close()

    except Exception as exc:
        print(f"[AutoAdapt] ❌ Browser error: {exc}")
        return {"error": f"Browser launch failed: {exc}"}

    if _adapt_error:
        return {"error": _adapt_error}
    if not sel_dict:
        return {"error": "AI did not return usable selectors. Check provider config."}

    # Persist to DB (upsert the single SiteSelectors row for this site).
    db2 = SessionLocal()
    try:
        row = db2.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
        if not row:
            row = SiteSelectors(site_id=site_id)
            db2.add(row)
        # Truncate to column limits before writing.
        row.container_sel = (sel_dict.get("container") or "")[:500]
        row.title_sel = (sel_dict.get("title_sel") or "")[:500]
        row.price_sel = (sel_dict.get("price_sel") or "")[:500]
        row.time_sel = (sel_dict.get("time_sel") or "")[:500]
        row.link_sel = (sel_dict.get("link_sel") or "")[:500]
        row.next_page_sel = (sel_dict.get("next_page_sel") or "")[:500]
        row.confidence = confidence
        row.container_count = count
        row.title_rate = title_rate
        row.price_rate = price_rate
        row.provider = provider
        row.generated_at = datetime.now()
        row.last_tested_at = datetime.now()
        row.stale = False
        row.notes = notes[:1000] if notes else ""
        db2.flush()
        db2.commit()
        print(f"[AutoAdapt] ✅ Selectors saved for '{site_name}' — confidence {confidence}")
    finally:
        db2.close()

    return {
        "status": "done",
        "site_name": site_name,
        "confidence": confidence,
        "container_count": count,
        "title_rate": title_rate,
        "price_rate": price_rate,
        "notes": notes,
        "selectors": sel_dict,
        "provider": provider,
    }
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
def _extract_price_and_currency(text: str) -> tuple[Optional[float], str]:
    """
    Parse a raw price string into (amount, ISO currency code).

    Examples: "$45.00", "CA$1,200", "£700.50", "45.00 USD", "EUR 120",
    "Current Bid: $50". Understands both comma-thousands ("1,200.00") and
    European period-thousands ("1.200,00") formats.
    Returns (None, "") — or (None, detected_code) — when no number parses.
    """
    if not text:
        return None, ""

    # Currency detection: first matching symbol/code in _CURRENCY_MAP wins
    # (case-insensitive containment).
    detected = ""
    haystack = text.upper()
    for symbol, iso_code in _CURRENCY_MAP:
        if symbol.upper() in haystack:
            detected = iso_code
            break

    # Strip every known currency marker so only digits/punctuation remain.
    stripped = text
    for symbol, _iso in _CURRENCY_MAP:
        stripped = stripped.replace(symbol, " ")
    stripped = stripped.strip()

    # European format: "1.200,50" → "1200.50" (dot = thousands, comma = decimal).
    if re.search(r"\d\.\d{3},\d{2}", stripped):
        stripped = stripped.replace(".", "").replace(",", ".")

    # Plain thousands separators: "1,200.50" → "1200.50".
    stripped = stripped.replace(",", "")

    # First integer or decimal number in what remains.
    match = re.search(r"\d+\.?\d*", stripped)
    if match is not None:
        try:
            return float(match.group()), detected
        except ValueError:
            pass
    return None, detected
def _format_price(amount: Optional[float], currency: str) -> str:
|
||
"""
|
||
Return display string like "50.00 USD" or "700.50 GBP".
|
||
If amount is None returns "".
|
||
"""
|
||
if amount is None:
|
||
return ""
|
||
cur = currency.strip() if currency else "USD"
|
||
return f"{amount:,.2f} {cur}"
|
||
|
||
|
||
# ── N4 — Currency conversion cache ───────────────────────────────────────────
# Module-level cache shared by _get_fx_rates() and _convert_price().
# Rates are USD-based as returned by frankfurter.app (?from=USD), i.e. each
# value is units of that currency per 1 USD, e.g. {"GBP": 0.79, "EUR": 0.92}.
_fx_rates: dict[str, float] = {}  # base=USD, e.g. {"GBP": 0.79, "EUR": 0.92}
_fx_fetched_at: float = 0.0       # epoch seconds of the last successful fetch
async def _get_fx_rates() -> dict[str, float]:
    """
    Return the USD-based exchange-rate table from frankfurter.app
    (free, no API key): {currency_code: units_per_1_USD}.

    The table is cached at module level for 6 hours. On any fetch problem
    (non-200, network error) the previous cache — possibly empty — is
    returned unchanged.
    """
    global _fx_rates, _fx_fetched_at

    cache_is_fresh = _fx_rates and (time.time() - _fx_fetched_at) < 21600  # 6h
    if cache_is_fresh:
        return _fx_rates

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get("https://api.frankfurter.app/latest?from=USD")
            if resp.status_code == 200:
                table = resp.json().get("rates", {})
                table["USD"] = 1.0  # base currency maps to itself
                _fx_rates = table
                _fx_fetched_at = time.time()
                print(f"[FX] ✅ Rates updated: {len(table)} currencies.")
            else:
                print(f"[FX] ⚠️ frankfurter.app returned {resp.status_code} — using cached rates.")
    except Exception as exc:
        print(f"[FX] ⚠️ Rate fetch failed: {exc} — using cached rates.")

    return _fx_rates
def _convert_price(price: float, from_currency: str, to_currency: str) -> float | None:
    """
    Convert `price` between currencies via the cached USD-based FX table.

    `_fx_rates` values are units-of-currency per 1 USD (frankfurter base=USD),
    so the conversion hops through USD: price / rate(src) gives USD, then
    × rate(dst) gives the target currency. Returns None when either code is
    unknown, arguments are missing, or no rates have been fetched yet.
    """
    if not price or not from_currency or not to_currency:
        return None

    src, dst = from_currency.upper(), to_currency.upper()
    if src == dst:
        return round(price, 2)
    if not _fx_rates:
        return None

    per_usd_src = _fx_rates.get(src)  # units of src per 1 USD
    per_usd_dst = _fx_rates.get(dst)  # units of dst per 1 USD
    if not per_usd_src or not per_usd_dst:
        return None

    # src → USD → dst
    return round(price / per_usd_src * per_usd_dst, 2)
def _extract_time_left(text: str) -> str:
|
||
"""
|
||
Parse a time-remaining string into a normalised "Xd Yh Zm" format.
|
||
|
||
Handles inputs like:
|
||
"2 days 4 hours 30 minutes"
|
||
"4h 30m"
|
||
"2d 04:30:00"
|
||
"Ends in 3 days"
|
||
"1 day left"
|
||
"Closing: 02:45" → hours:minutes only
|
||
"23:14:05" → HH:MM:SS
|
||
"Time Left: 0 days 2 hours 15 minutes"
|
||
Returns empty string if nothing parseable found.
|
||
"""
|
||
if not text:
|
||
return ""
|
||
t = text.lower()
|
||
|
||
days = 0
|
||
hours = 0
|
||
mins = 0
|
||
|
||
# "X day(s)"
|
||
m = re.search(r"(\d+)\s*d(?:ay)?s?", t)
|
||
if m:
|
||
days = int(m.group(1))
|
||
|
||
# "X hour(s)" or "Xh"
|
||
m = re.search(r"(\d+)\s*h(?:our)?s?", t)
|
||
if m:
|
||
hours = int(m.group(1))
|
||
|
||
# "X minute(s)" or "Xm"
|
||
m = re.search(r"(\d+)\s*m(?:in(?:ute)?s?)?(?!s)", t)
|
||
if m:
|
||
mins = int(m.group(1))
|
||
|
||
# HH:MM:SS or HH:MM (only if no days/hours/mins found yet)
|
||
if days == 0 and hours == 0 and mins == 0:
|
||
m = re.search(r"(\d{1,2}):(\d{2})(?::(\d{2}))?", t)
|
||
if m:
|
||
hours = int(m.group(1))
|
||
mins = int(m.group(2))
|
||
# seconds ignored
|
||
|
||
if days == 0 and hours == 0 and mins == 0:
|
||
return ""
|
||
|
||
parts = []
|
||
if days: parts.append(f"{days}d")
|
||
if hours: parts.append(f"{hours}h")
|
||
if mins: parts.append(f"{mins}m")
|
||
return " ".join(parts) if parts else ""
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N2 — CAPTCHA Detection & Solver
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lower-case substrings that indicate a CAPTCHA / anti-bot challenge page.
# _detect_captcha() matches these against the page HTML and <title>.
_CAPTCHA_INDICATORS = [
    "captcha", "cf-challenge", "g-recaptcha", "h-captcha",
    "datadome", "px-captcha", "challenge-form", "robot check",
    "are you human", "verify you are human", "security check",
    "just a moment", "checking your browser",  # Cloudflare phrases
]
async def _detect_captcha(page) -> bool:
    """Return True if the current page appears to show a CAPTCHA challenge."""
    try:
        haystacks = (
            (await page.content()).lower(),
            (await page.title()).lower(),
        )
        return any(ind in hay for ind in _CAPTCHA_INDICATORS for hay in haystacks)
    except Exception:
        # Page gone / navigation race — treat as "no CAPTCHA seen".
        return False
async def _solve_captcha_2captcha(page, api_key: str) -> bool:
    """
    Attempt to solve a sitekey-based CAPTCHA (reCAPTCHA/hCaptcha) via 2captcha.

    Flow: read data-sitekey from the page → submit to 2captcha's in.php →
    poll res.php every 5s (24 tries ≈ 120s budget) → inject the returned
    g-recaptcha-response token and fire the widget callbacks.
    Returns True if solved successfully; False on any failure or timeout.
    """
    try:
        # Extract sitekey from page
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            return el ? el.getAttribute('data-sitekey') : null;
        }""")
        if not sitekey:
            print("[CAPTCHA] ⚠️ No sitekey found — cannot auto-solve.")
            return False

        page_url = page.url
        # Submit task to 2captcha
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post("https://2captcha.com/in.php", data={
                "key": api_key, "method": "userrecaptcha",
                "googlekey": sitekey, "pageurl": page_url, "json": 1,
            })
            data = r.json()
            if data.get("status") != 1:
                print(f"[CAPTCHA] ❌ 2captcha submit failed: {data}")
                return False
            task_id = data["request"]

        # Poll for result (up to 120s)
        for attempt in range(24):
            await asyncio.sleep(5)
            async with httpx.AsyncClient(timeout=15) as client:
                r = await client.get("https://2captcha.com/res.php", params={
                    "key": api_key, "action": "get", "id": task_id, "json": 1,
                })
                data = r.json()
                if data.get("status") == 1:
                    token = data["request"]
                    # Inject token into page. NOTE: the f-prefix adds no
                    # interpolation here — the JS braces are doubled only to
                    # survive it; the token is passed as the evaluate() arg.
                    await page.evaluate(f"""(token) => {{
                        document.getElementById('g-recaptcha-response').value = token;
                        if (window.___grecaptcha_cfg) {{
                            const cbs = Object.values(window.___grecaptcha_cfg.clients || {{}});
                            cbs.forEach(c => {{ if (c.l && c.l.l) c.l.l(token); }});
                        }}
                    }}""", token)
                    await asyncio.sleep(1)
                    print("[CAPTCHA] ✅ 2captcha token injected.")
                    return True
                # Anything other than "not ready yet" is a hard error.
                if data.get("request") != "CAPCHA_NOT_READY":
                    print(f"[CAPTCHA] ❌ 2captcha error: {data}")
                    return False
        print("[CAPTCHA] ❌ 2captcha timed out after 120s.")
        return False
    except Exception as exc:
        print(f"[CAPTCHA] ❌ Solver error: {exc}")
        return False
async def _solve_captcha_capsolver(page, api_key: str) -> bool:
    """
    CapSolver-flavoured solver — same flow as the 2captcha variant, different
    endpoint: read data-sitekey → createTask (ReCaptchaV2TaskProxyLess) →
    poll getTaskResult every 5s (≈120s budget) → inject the token.

    Returns True on success, False on any failure or timeout.

    FIX: token injection previously used
    `page.evaluate("...arguments[0]...", token)` — Playwright evaluates a
    plain string as an expression and does NOT expose an `arguments` object,
    so the token was never written. Now uses a proper function expression
    that receives the token as its argument (same pattern as the 2captcha
    solver), and guards against a missing response element.
    """
    try:
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            return el ? el.getAttribute('data-sitekey') : null;
        }""")
        if not sitekey:
            return False

        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post("https://api.capsolver.com/createTask", json={
                "clientKey": api_key,
                "task": {"type": "ReCaptchaV2TaskProxyLess",
                         "websiteURL": page.url, "websiteKey": sitekey},
            })
            data = r.json()
            if data.get("errorId", 1) != 0:
                print(f"[CAPTCHA] ❌ CapSolver create failed: {data}")
                return False
            task_id = data["taskId"]

        # Poll for the solution (24 × 5s ≈ 120s budget).
        for _ in range(24):
            await asyncio.sleep(5)
            async with httpx.AsyncClient(timeout=15) as client:
                r = await client.post("https://api.capsolver.com/getTaskResult", json={
                    "clientKey": api_key, "taskId": task_id,
                })
                data = r.json()
                if data.get("status") == "ready":
                    token = data["solution"]["gRecaptchaResponse"]
                    # Function expression: token arrives as the JS parameter.
                    await page.evaluate("""(token) => {
                        const el = document.getElementById('g-recaptcha-response');
                        if (el) el.value = token;
                    }""", token)
                    print("[CAPTCHA] ✅ CapSolver token injected.")
                    return True
                if data.get("status") == "failed":
                    print(f"[CAPTCHA] ❌ CapSolver failed: {data}")
                    return False
        return False
    except Exception as exc:
        print(f"[CAPTCHA] ❌ CapSolver error: {exc}")
        return False
async def _solve_cf_turnstile(page, solver: str, api_key: str) -> bool:
    """
    Solve a Cloudflare Turnstile managed challenge using 2captcha or CapSolver.

    Extracts the Turnstile sitekey from the CF page, submits to solver API,
    polls for the token (24 × 5s ≈ 120s budget per provider), injects the
    returned token into the hidden form field, and submits the challenge form.
    Returns True if challenge was submitted successfully (CF redirect follows);
    False on missing sitekey, solver error, timeout, or unknown `solver` value.
    """
    try:
        sitekey = await page.evaluate("""() => {
            const el = document.querySelector('[data-sitekey]');
            if (el) return el.getAttribute('data-sitekey');
            // Fallback: parse from inline script source on CF managed challenge page
            const m = document.body.innerHTML.match(/sitekey[\"\\s:=]+[\"']([0-9a-zA-Z_\\-]{10,})/);
            return m ? m[1] : null;
        }""")
        if not sitekey:
            print("[AutoAdapt-CF] ⚠️ Turnstile sitekey not found — cannot auto-solve")
            return False

        page_url = page.url
        token = None  # stays None if polling below never succeeds

        if solver == "2captcha":
            # Submit the Turnstile task.
            async with httpx.AsyncClient(timeout=30) as client:
                r = await client.post("https://2captcha.com/in.php", data={
                    "key": api_key, "method": "turnstile",
                    "sitekey": sitekey, "pageurl": page_url, "json": 1,
                })
                data = r.json()
                if data.get("status") != 1:
                    print(f"[AutoAdapt-CF] ❌ 2captcha Turnstile submit failed: {data}")
                    return False
                task_id = data["request"]
            # Poll until ready, hard error, or timeout.
            for _ in range(24):
                await asyncio.sleep(5)
                async with httpx.AsyncClient(timeout=15) as client:
                    r = await client.get("https://2captcha.com/res.php", params={
                        "key": api_key, "action": "get", "id": task_id, "json": 1,
                    })
                    data = r.json()
                    if data.get("status") == 1:
                        token = data["request"]
                        break
                    if data.get("request") != "CAPCHA_NOT_READY":
                        print(f"[AutoAdapt-CF] ❌ 2captcha Turnstile error: {data}")
                        return False

        elif solver == "capsolver":
            # Submit the Turnstile task.
            async with httpx.AsyncClient(timeout=30) as client:
                r = await client.post("https://api.capsolver.com/createTask", json={
                    "clientKey": api_key,
                    "task": {"type": "AntiTurnstileTaskProxyLess",
                             "websiteURL": page_url, "websiteKey": sitekey},
                })
                data = r.json()
                if data.get("errorId", 1) != 0:
                    print(f"[AutoAdapt-CF] ❌ CapSolver Turnstile create failed: {data}")
                    return False
                task_id = data["taskId"]
            # Poll until ready, hard failure, or timeout.
            for _ in range(24):
                await asyncio.sleep(5)
                async with httpx.AsyncClient(timeout=15) as client:
                    r = await client.post("https://api.capsolver.com/getTaskResult", json={
                        "clientKey": api_key, "taskId": task_id,
                    })
                    data = r.json()
                    if data.get("status") == "ready":
                        token = data["solution"]["token"]
                        break
                    if data.get("status") == "failed":
                        print(f"[AutoAdapt-CF] ❌ CapSolver Turnstile failed: {data}")
                        return False

        if not token:
            print("[AutoAdapt-CF] ❌ Turnstile solver timed out after 120s")
            return False

        # Inject token into the hidden field and submit the CF challenge form
        await page.evaluate("""(token) => {
            const inp = document.querySelector('[name="cf-turnstile-response"]');
            if (inp) inp.value = token;
            const form = document.querySelector('#challenge-form') ||
                         document.querySelector('form[action]');
            if (form) form.submit();
        }""", token)
        print("[AutoAdapt-CF] ✅ Turnstile token injected and challenge form submitted")
        return True

    except Exception as exc:
        print(f"[AutoAdapt-CF] ❌ Turnstile solver error: {exc}")
        return False
async def _handle_captcha(page, site_name: str) -> bool:
    """
    Unified CAPTCHA handler. Reads solver preference from Config.
    Returns True if solved (or no CAPTCHA present), False if blocked.
    """
    if not await _detect_captcha(page):
        return True  # no captcha — all clear

    solver = _get_config("captcha_solver", "none").lower()
    api_key = _get_config("captcha_api_key", "")

    print(f"[CAPTCHA] 🔒 CAPTCHA detected on {site_name} — solver={solver}")
    await send_alert(
        f"🔒 <b>CAPTCHA detected</b> on <b>{site_name}</b>\n"
        f"Solver: {solver}. {'Attempting auto-solve…' if solver != 'none' else '⚠️ No solver configured — set one in Settings.'}"
    )

    if solver == "none" or not api_key:
        return False

    # Dispatch to the configured backend; unknown names fall through to False.
    backends = {
        "2captcha": _solve_captcha_2captcha,
        "capsolver": _solve_captcha_capsolver,
    }
    backend = backends.get(solver)
    if backend is None:
        return False
    return await backend(page, api_key)
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N3 — Block / Rate-limit Detection & Site Health Tracking
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Lower-case substrings indicating a ban / rate-limit / bot-wall page.
# _detect_block() matches these against the page HTML and <title>.
_BLOCK_PHRASES = [
    "access denied", "403 forbidden", "you have been blocked",
    "your ip", "rate limit", "too many requests", "bot detected",
    "automated access", "suspicious activity", "security check required",
    "enable javascript", "please enable cookies",
]
async def _detect_block(page) -> bool:
    """Return True if the current page is a block/ban/rate-limit page."""
    try:
        # Other code may stash the last HTTP status on the page object;
        # a blocking status alone is conclusive.
        status = getattr(page, "_last_status", None)
        if status in (403, 429, 503):
            return True
        body = (await page.content()).lower()
        heading = (await page.title()).lower()
        return any(phrase in body or phrase in heading for phrase in _BLOCK_PHRASES)
    except Exception:
        # Navigation race / closed page — can't tell, assume not blocked.
        return False
def _record_site_success(site_id: int) -> None:
    """Reset error counters and record last_success_at after a successful scrape."""
    session = SessionLocal()
    try:
        site = session.query(TargetSite).filter(TargetSite.id == site_id).first()
        if site is not None:
            site.consecutive_failures = 0
            site.last_success_at = datetime.now()
            site.cooldown_until = None  # success lifts any active cooldown
            session.commit()
    except Exception as exc:
        # Health bookkeeping must never break the scrape loop — log and move on.
        print(f"[Health] ⚠️ Could not record success for site {site_id}: {exc}")
    finally:
        session.close()
def _record_site_error(site_id: int, error_msg: str) -> None:
    """
    Increment error counters. If consecutive failures exceed the threshold,
    put the site into a 30-minute cooldown and fire a Telegram alert.

    FIXES:
    - The DB commit now happens BEFORE the alert is scheduled. Previously,
      `asyncio.create_task()` ran first and — since this is a sync function
      that may be called outside the event-loop thread — could raise
      RuntimeError("no running event loop"), which the broad `except`
      swallowed, silently skipping the commit and losing the counters.
    - Alert scheduling is now individually guarded so a scheduling failure
      is logged instead of discarded.
    - Removed the redundant local `from datetime import timedelta`
      (timedelta is already imported at module level).
    """
    db = SessionLocal()
    alert_msg = None
    try:
        s = db.query(TargetSite).filter(TargetSite.id == site_id).first()
        if not s:
            return
        s.last_error = error_msg[:500]
        s.error_count = (s.error_count or 0) + 1
        s.consecutive_failures = (s.consecutive_failures or 0) + 1

        # threshold <= 0 disables the cooldown escalation entirely.
        threshold = int(_get_config("site_auto_disable_after", "5"))

        if s.consecutive_failures >= threshold and threshold > 0:
            # Put into 30-minute cooldown
            s.cooldown_until = datetime.now() + timedelta(minutes=30)
            alert_msg = (
                f"⛔ <b>Site blocked/failing: {s.name}</b>\n"
                f"Consecutive failures: {s.consecutive_failures}\n"
                f"Last error: {error_msg[:200]}\n"
                f"Site placed in 30-minute cooldown."
            )
            print(f"[Health] ⛔ {s.name} in cooldown after {s.consecutive_failures} failures.")

        db.commit()
    except Exception as exc:
        print(f"[Health] ⚠️ Could not record error for site {site_id}: {exc}")
    finally:
        db.close()

    if alert_msg:
        try:
            # Only valid when called from the event-loop thread; otherwise
            # log instead of silently losing the alert.
            asyncio.create_task(send_alert(alert_msg))
        except RuntimeError as exc:
            print(f"[Health] ⚠️ Could not schedule Telegram alert: {exc}")
async def _discover_search_input(page, css_fallback: str, site_name: str):
    """
    Search Discovery — locate the search input on any page, preferring
    Playwright's semantic locators over the CSS selector stored in the DB.

    Probe order (first match wins):
      1. role "textbox" with accessible name matching "search"
      2. explicit ARIA role "searchbox"
      3. placeholder text containing "search"
      4. <label>/aria-label containing "search"
      5. css_fallback — the site's stored search_selector

    Each semantic probe waits at most 5 seconds so a missing element fails
    fast; the CSS fallback gets the full 15-second wait_for_selector so slow
    navbars still load. Returns the located Locator on success, or raises
    RuntimeError with a descriptive message when every strategy fails (so the
    caller can bail cleanly and log the exact reason).
    """
    TIMEOUT_MS = 5_000  # per-strategy timeout for semantic probes
    search_pat = re.compile(r"search", re.I)

    # (label, zero-arg locator builder) — probed in order; the first locator
    # whose element appears within TIMEOUT_MS wins.
    probes = [
        ("semantic:role[textbox+name=search]",
         lambda: page.get_by_role("textbox", name=search_pat)),
        ("semantic:role[searchbox]",
         lambda: page.get_by_role("searchbox")),
        ("semantic:placeholder[search]",
         lambda: page.get_by_placeholder(search_pat)),
        ("semantic:label[search]",
         lambda: page.get_by_label(search_pat)),
    ]

    for strategy_name, build in probes:
        try:
            candidate = build()
            await candidate.wait_for(timeout=TIMEOUT_MS)
        except Exception:
            continue  # probe failed — try the next strategy silently
        print(
            f"[Scraper] 🔍 {site_name}: search input located via "
            f"{strategy_name}"
        )
        return candidate

    # ── CSS fallback ──────────────────────────────────────────────────────────
    if css_fallback:
        try:
            print(
                f"[Scraper] 🔍 {site_name}: semantic probes exhausted, "
                f"falling back to CSS selector '{css_fallback}'"
            )
            await page.wait_for_selector(css_fallback, timeout=15_000)
            return page.locator(css_fallback)
        except Exception as css_exc:
            raise RuntimeError(
                f"CSS fallback '{css_fallback}' also failed: {css_exc}"
            ) from css_exc

    raise RuntimeError(
        "All semantic probes failed and no CSS fallback was provided. "
        "Add a search_selector for this site in the Target Sites tab."
    )
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N14 — Login Session Helper
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def _check_login_status(page, site: TargetSite) -> bool:
|
||
"""
|
||
Returns True if the browser session appears to be logged into this site.
|
||
Detection strategy (in order):
|
||
1. If site.login_check_selector is set: element present = logged in.
|
||
2. If site.login_url is set: navigate to login page, check if we're
|
||
redirected away (already logged in) or stay on login page (not logged in).
|
||
3. If neither configured: assume logged in (don't block scraping).
|
||
"""
|
||
if not site.requires_login or not site.login_enabled:
|
||
return True # login not required for this site
|
||
|
||
try:
|
||
if site.login_check_selector and site.login_check_selector.strip():
|
||
el = await page.query_selector(site.login_check_selector)
|
||
logged_in = el is not None
|
||
if not logged_in:
|
||
print(f"[Login] ⚠️ {site.name}: not logged in (selector absent).")
|
||
return logged_in
|
||
|
||
if site.login_url and site.login_url.strip():
|
||
before = page.url
|
||
await page.goto(site.login_url, timeout=30_000, wait_until="domcontentloaded")
|
||
after = page.url
|
||
logged_in = after != site.login_url # redirected away = logged in
|
||
if not logged_in:
|
||
print(f"[Login] ⚠️ {site.name}: still on login page — not logged in.")
|
||
return logged_in
|
||
|
||
except Exception as exc:
|
||
print(f"[Login] ⚠️ Login check failed for {site.name}: {exc}")
|
||
|
||
return True # can't determine — proceed anyway
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N5 — Pagination Helper
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
_NEXT_PAGE_SELS = [
|
||
"a[aria-label='Next']",
|
||
"a[aria-label='Next page']",
|
||
"a[rel='next']",
|
||
"a.pagination-next",
|
||
"li.next > a",
|
||
"button[aria-label='Next page']",
|
||
".s-pagination-next", # eBay
|
||
"[data-testid='pagination-next']",
|
||
"a:has-text('Next')",
|
||
"a:has-text('›')",
|
||
"a:has-text('»')",
|
||
]
|
||
|
||
async def _go_next_page(page) -> bool:
|
||
"""
|
||
Attempt to click the "Next page" button. Returns True if navigation succeeded.
|
||
Tries a list of common selectors across different auction platforms.
|
||
"""
|
||
for sel in _NEXT_PAGE_SELS:
|
||
try:
|
||
el = await page.query_selector(sel)
|
||
if el:
|
||
is_disabled = await el.get_attribute("aria-disabled") or \
|
||
await el.get_attribute("disabled") or ""
|
||
if "true" in str(is_disabled).lower() or "disabled" in str(is_disabled).lower():
|
||
continue
|
||
await el.click()
|
||
await page.wait_for_load_state("networkidle", timeout=30_000)
|
||
return True
|
||
except Exception:
|
||
continue
|
||
return False
|
||
|
||
|
||
# ── Page-level lot-card extractor (used by scrape_site + pagination) ──────────
|
||
# Returns a list of {title, price_text, time_text, location, href, images[]}
|
||
# Extracted once and reused so the pagination path shares the same logic.
|
||
JS_EXTRACT = """() => {
|
||
|
||
// ─────────────────────────────────────────────────────────────────────────
|
||
// CARD-ANCHORED EXTRACTION
|
||
// Strategy: find each listing's "card" container first, then look for
|
||
// title / price / time / link / images INSIDE that same card.
|
||
// For Angular/React sites where price+time live outside the card (e.g. HiBid),
|
||
// we walk UP through ancestors until we reach a container that holds all
|
||
// four elements together.
|
||
// ─────────────────────────────────────────────────────────────────────────
|
||
|
||
// ── 1. Find card containers ───────────────────────────────────────────────
|
||
const CARD_SELS = [
|
||
'li.s-item', // eBay
|
||
'div.lot-card', // ShopGoodwill
|
||
'.item-card',
|
||
'article[class*="lot"]',
|
||
'article[class*="item"]',
|
||
'div[class*="lot-item"]',
|
||
'div[class*="lot-card"]',
|
||
'div[class*="listing-item"]',
|
||
'div[class*="result-item"]',
|
||
'[data-listing-id]',
|
||
'[data-lot-id]',
|
||
'[data-item-id]',
|
||
];
|
||
|
||
let cards = [];
|
||
for (const s of CARD_SELS) {
|
||
try {
|
||
const els = document.querySelectorAll(s);
|
||
if (els.length >= 2) { cards = Array.from(els); break; }
|
||
} catch(e) {}
|
||
}
|
||
|
||
// ── 2. Helpers ────────────────────────────────────────────────────────────
|
||
|
||
// Query inside a root, trying selectors in order
|
||
const qIn = (root, sels) => {
|
||
for (const s of sels) {
|
||
try { const el = root.querySelector(s); if (el) return el; }
|
||
catch(e) {}
|
||
}
|
||
return null;
|
||
};
|
||
|
||
// ── Apollo Client cache helper (HiBid & other GraphQL SPAs) ──────────────
|
||
// Builds a map of lotId → [imageUrl, ...] from the Apollo in-memory cache.
|
||
// Works on both search-results pages and detail pages.
|
||
// Returns empty map if Apollo is not present.
|
||
const _apolloImgMap = (() => {
|
||
const map = {};
|
||
try {
|
||
if (!window.__APOLLO_CLIENT__) return map;
|
||
const cache = window.__APOLLO_CLIENT__.cache.extract();
|
||
for (const cacheKey of Object.keys(cache)) {
|
||
// Keys like "Lot:289880823" or "Item:12345"
|
||
const m = cacheKey.match(/^(?:Lot|Item|AuctionLot|Product):(\\d+)$/);
|
||
if (!m) continue;
|
||
const id = m[1];
|
||
const entry = cache[cacheKey];
|
||
const urls = [];
|
||
// HiBid: pictures[].fullSizeLocation / hdThumbnailLocation
|
||
const pics = entry.pictures || entry.images || entry.photos || [];
|
||
if (Array.isArray(pics)) {
|
||
for (const p of pics) {
|
||
const u = typeof p === 'string' ? p :
|
||
(p.fullSizeLocation || p.hdThumbnailLocation ||
|
||
p.thumbnailLocation || p.url || p.src || p.imageUrl || '');
|
||
if (u && u.startsWith('http')) urls.push(u);
|
||
}
|
||
}
|
||
// Also check featuredPicture
|
||
const fp = entry.featuredPicture;
|
||
if (fp && typeof fp === 'object') {
|
||
const u = fp.fullSizeLocation || fp.hdThumbnailLocation || fp.url || '';
|
||
if (u && u.startsWith('http') && !urls.includes(u)) urls.push(u);
|
||
}
|
||
if (urls.length) map[id] = urls;
|
||
}
|
||
} catch(e) {}
|
||
return map;
|
||
})();
|
||
|
||
// Get images for a given href via Apollo cache (HiBid: /lot/{id}/...)
|
||
const apolloImagesForHref = (href) => {
|
||
try {
|
||
const m = (href || '').match(/\\/(?:lot|item|product)\\/(\\d+)/i);
|
||
if (m && _apolloImgMap[m[1]]) return _apolloImgMap[m[1]];
|
||
} catch(e) {}
|
||
return null;
|
||
};
|
||
|
||
// Get text from element, with label stripping
|
||
const priceText = el => {
|
||
if (!el) return '';
|
||
let t = (el.innerText || el.getAttribute('data-price') ||
|
||
el.getAttribute('content') || '').trim();
|
||
// Multi-line: take first line with a digit
|
||
if (t.includes('\\n')) {
|
||
const ln = t.split('\\n').find(x => /[0-9]/.test(x));
|
||
if (ln) t = ln.trim();
|
||
}
|
||
// Strip label prefix: "Current Bid: $45" → "$45"
|
||
t = t.replace(/^[^[0-9]$£€¥₹CA]*(?=[$£€¥₹[0-9]])/, '').trim();
|
||
return t;
|
||
};
|
||
|
||
// Extract image URLs from a container element (up to 5, no tiny icons)
|
||
// Apollo href override: if a lot href is provided, tries Apollo cache first.
|
||
const extractImages = (root, href) => {
|
||
// Apollo cache takes priority — full gallery, no DOM scanning needed
|
||
const apolloUrls = apolloImagesForHref(href);
|
||
if (apolloUrls && apolloUrls.length) return apolloUrls.slice(0, 10);
|
||
|
||
const urls = [];
|
||
if (!root) return urls;
|
||
const imgs = root.querySelectorAll('img');
|
||
for (const img of Array.from(imgs).slice(0, 10)) {
|
||
const src = img.getAttribute('data-src') || img.getAttribute('data-lazy-src') ||
|
||
img.getAttribute('data-original') || img.getAttribute('data-lazy') ||
|
||
img.src || '';
|
||
if (!src || !src.startsWith('http') || src.length < 20) continue;
|
||
// Skip tiny icons / trackers (check rendered or attribute dimensions)
|
||
const w = img.naturalWidth || parseInt(img.getAttribute('width') || '0') || 0;
|
||
const h = img.naturalHeight || parseInt(img.getAttribute('height') || '0') || 0;
|
||
if (w > 0 && w < 40) continue;
|
||
if (h > 0 && h < 40) continue;
|
||
if (!urls.includes(src)) urls.push(src);
|
||
if (urls.length >= 5) break;
|
||
}
|
||
return urls;
|
||
};
|
||
|
||
const TITLE_SELS = [
|
||
'.s-item__title',
|
||
'h2.lot-title', 'h3.lot-title',
|
||
'h2[class*="lot"]', 'h3[class*="lot"]',
|
||
'h2[class*="title"]', 'h3[class*="title"]',
|
||
'.lot-title', '.item-title', '.listing-title',
|
||
'.product-title', '.card-title',
|
||
'h2', 'h3',
|
||
];
|
||
const PRICE_SELS = [
|
||
'.s-item__price',
|
||
'[class*="current-bid"] [class*="amount"]',
|
||
'[class*="current-bid"]',
|
||
'[class*="bid-amount"]', '[class*="bid-price"]',
|
||
'[class*="currentBid"]',
|
||
'[itemprop="price"]',
|
||
'[class*="price-value"]', '[class*="price_value"]',
|
||
'span[class*="price"]',
|
||
'.price', '[class*="price"]',
|
||
];
|
||
const TIME_SELS = [
|
||
'.s-item__time-left', '.s-item__time-end',
|
||
'[class*="time-left"]', '[class*="timeleft"]',
|
||
'[class*="countdown"]', '[class*="closing-time"]',
|
||
'[class*="closingTime"]', '[class*="time-remaining"]',
|
||
'[class*="ends-in"]', '[class*="lot-end"]',
|
||
'[class*="auction-end"]', '[class*="expire"]',
|
||
'[class*="end-time"]', 'time',
|
||
];
|
||
const LINK_SELS = [
|
||
"a[href*='/lot/']", "a[href*='/item/']",
|
||
"a[href*='/itm/']", "a[href*='/listing/']",
|
||
"a[href*='/product/']", "a[href*='/auction/']", "a",
|
||
];
|
||
|
||
// ── 3. Per-card extraction ─────────────────────────────────────────────────
|
||
const rows = [];
|
||
|
||
if (cards.length >= 2) {
|
||
// NORMAL PATH: each card contains all its own data
|
||
for (const card of cards.slice(0, 30)) {
|
||
const titleEl = qIn(card, TITLE_SELS);
|
||
const title = titleEl ? titleEl.innerText.trim() : '';
|
||
if (!title || title.length < 4) continue;
|
||
|
||
const pt = priceText(qIn(card, PRICE_SELS));
|
||
const tt = (qIn(card, TIME_SELS) || {innerText:''}).innerText.trim();
|
||
|
||
// Location
|
||
const LOC_SELS = [
|
||
'[data-location]','[class*="location"]','[class*="country"]','[class*="city"]',
|
||
'.item-location','.lot-location','.seller-location',
|
||
'span[itemprop="addressLocality"]','span[itemprop="addressCountry"]',
|
||
];
|
||
let locEl = qIn(card, LOC_SELS);
|
||
let location = locEl ? locEl.textContent.trim().replace(/\\s+/g,' ').slice(0,80) : '';
|
||
|
||
// Link: card itself → title's closest <a> → first link sel inside card
|
||
let href = card.tagName === 'A' ? card.href : '';
|
||
if (!href && titleEl) {
|
||
const a = titleEl.closest('a') || titleEl.querySelector('a');
|
||
if (a) href = a.href;
|
||
}
|
||
if (!href) {
|
||
const a = qIn(card, LINK_SELS);
|
||
if (a) href = a.href;
|
||
}
|
||
|
||
// Images — Apollo cache first, then DOM fallback
|
||
const images = extractImages(card, href);
|
||
|
||
rows.push({ title, price_text: pt, time_text: tt, location, href: href || '', images });
|
||
}
|
||
}
|
||
|
||
// ── 4. FALLBACK: no card containers found (Angular SPA etc.) ───────────────
|
||
// Find all title elements, then for each one walk UP the DOM to
|
||
// find the smallest ancestor that also contains a price element.
|
||
// That ancestor is the logical "card" for this title.
|
||
if (rows.length === 0) {
|
||
const titleEls = [];
|
||
for (const s of TITLE_SELS) {
|
||
try {
|
||
const found = document.querySelectorAll(s);
|
||
if (found.length >= 2) {
|
||
titleEls.push(...Array.from(found));
|
||
break;
|
||
}
|
||
} catch(e) {}
|
||
}
|
||
|
||
for (const titleEl of titleEls.slice(0, 30)) {
|
||
const title = titleEl.innerText.trim();
|
||
if (!title || title.length < 4) continue;
|
||
|
||
// Walk up max 8 levels to find a container with price/time
|
||
let container = titleEl.parentElement;
|
||
let pt = '', tt = '', location = '', href = '';
|
||
for (let depth = 0; depth < 8 && container; depth++) {
|
||
pt = priceText(qIn(container, PRICE_SELS));
|
||
tt = (qIn(container, TIME_SELS) || {innerText:''}).innerText.trim();
|
||
if (pt || depth >= 5) break; // found or deep enough
|
||
container = container.parentElement;
|
||
}
|
||
|
||
// Location from container
|
||
if (container) {
|
||
const LOC_SELS = [
|
||
'[data-location]','[class*="location"]','[class*="country"]','[class*="city"]',
|
||
'.item-location','.lot-location','.seller-location',
|
||
'span[itemprop="addressLocality"]','span[itemprop="addressCountry"]',
|
||
];
|
||
let locEl = qIn(container, LOC_SELS);
|
||
location = locEl ? locEl.textContent.trim().replace(/\\s+/g,' ').slice(0,80) : '';
|
||
}
|
||
|
||
// Link from container
|
||
if (container) {
|
||
const a = container.tagName === 'A' ? container :
|
||
titleEl.closest('a') || qIn(container, LINK_SELS);
|
||
if (a) href = a.href || '';
|
||
}
|
||
|
||
// Images — Apollo cache first (uses href), then DOM fallback
|
||
const images = extractImages(container, href);
|
||
|
||
rows.push({ title, price_text: pt, time_text: tt, location, href, images });
|
||
}
|
||
}
|
||
|
||
return rows;
|
||
}"""
|
||
|
||
# ── JS_DETAIL_IMAGES ─────────────────────────────────────────────────────────
|
||
# Runs on a LOT DETAIL page (not search results). 5-layer image extraction:
|
||
# Layer 0 — Apollo Client cache (HiBid / GraphQL SPAs)
|
||
# Layer 1 — JSON-LD structured data
|
||
# Layer 2 — Open Graph meta tags
|
||
# Layer 3 — DOM <img> elements (all data-* attrs)
|
||
# Layer 4 — <picture><source> srcset
|
||
# Returns plain array of image URL strings (up to 10).
|
||
JS_DETAIL_IMAGES = r"""() => {
|
||
const seen = new Set();
|
||
const imgUrls = [];
|
||
const addUrl = (src) => {
|
||
if (!src || typeof src !== 'string') return false;
|
||
src = src.trim();
|
||
const lc = src.toLowerCase();
|
||
if (!src.startsWith('http') || src.length < 20) return false;
|
||
if (lc.includes('1x1') || lc.includes('pixel.gif') ||
|
||
lc.includes('tracking') || lc.includes('beacon')) return false;
|
||
if (seen.has(src)) return false;
|
||
seen.add(src);
|
||
imgUrls.push(src);
|
||
return true;
|
||
};
|
||
|
||
// Layer 0: Apollo Client cache (HiBid & GraphQL SPAs)
|
||
try {
|
||
if (window.__APOLLO_CLIENT__) {
|
||
const ac = window.__APOLLO_CLIENT__.cache.extract();
|
||
const lotIdMatch = location.href.match(/\/(?:lot|item|product)\/(\d+)/i);
|
||
const lotId = lotIdMatch ? lotIdMatch[1] : null;
|
||
const keys = lotId ? ['Lot:'+lotId, 'Item:'+lotId] :
|
||
Object.keys(ac).filter(k => /^(Lot|Item|AuctionLot|Product):/.test(k));
|
||
for (const ck of keys) {
|
||
const e = ac[ck]; if (!e) continue;
|
||
const pics = e.pictures || e.images || e.photos || [];
|
||
if (Array.isArray(pics)) {
|
||
for (const p of pics) {
|
||
const u = typeof p === 'string' ? p :
|
||
(p.fullSizeLocation || p.hdThumbnailLocation ||
|
||
p.thumbnailLocation || p.url || p.src || '');
|
||
addUrl(u);
|
||
}
|
||
}
|
||
const fp = e.featuredPicture;
|
||
if (fp && typeof fp === 'object')
|
||
addUrl(fp.fullSizeLocation || fp.hdThumbnailLocation || fp.url || '');
|
||
}
|
||
}
|
||
} catch(e) {}
|
||
|
||
// Layer 1: JSON-LD structured data
|
||
try {
|
||
for (const s of document.querySelectorAll('script[type="application/ld+json"]')) {
|
||
try {
|
||
const parsed = JSON.parse(s.textContent || '');
|
||
for (const node of (Array.isArray(parsed) ? parsed : [parsed])) {
|
||
const imgs = node.image || node.photo || [];
|
||
for (const img of (Array.isArray(imgs) ? imgs : [imgs])) {
|
||
const u = typeof img === 'string' ? img :
|
||
(img.url || img.contentUrl || img['@id'] || '');
|
||
addUrl(u);
|
||
}
|
||
}
|
||
} catch(e) {}
|
||
}
|
||
} catch(e) {}
|
||
|
||
// Layer 2: Open Graph meta tags
|
||
if (imgUrls.length < 2) {
|
||
try {
|
||
for (const m of document.querySelectorAll(
|
||
'meta[property="og:image"], meta[name="og:image"], ' +
|
||
'meta[property="og:image:url"], meta[itemprop="image"]'))
|
||
addUrl(m.getAttribute('content') || m.getAttribute('href') || '');
|
||
} catch(e) {}
|
||
}
|
||
|
||
// Layer 3: DOM img elements
|
||
if (imgUrls.length < 5) {
|
||
try {
|
||
for (const img of document.querySelectorAll('img')) {
|
||
if (imgUrls.length >= 10) break;
|
||
let picked = '';
|
||
for (const attr of img.attributes) {
|
||
const v = attr.value || '';
|
||
if ((attr.name.startsWith('data-') || attr.name === 'src') &&
|
||
v.startsWith('http') && v.length > 20 &&
|
||
/\.(jpe?g|png|webp|gif)(\?|$)/i.test(v)) { picked = v; break; }
|
||
}
|
||
if (!picked) picked = img.src || '';
|
||
if (!picked.startsWith('http')) continue;
|
||
const w = parseInt(img.getAttribute('width') || '0') || 0;
|
||
const h = parseInt(img.getAttribute('height') || '0') || 0;
|
||
if ((w > 0 && w < 50) || (h > 0 && h < 50)) continue;
|
||
addUrl(picked);
|
||
}
|
||
} catch(e) {}
|
||
}
|
||
|
||
// Layer 4: <picture><source> srcset
|
||
if (imgUrls.length < 5) {
|
||
try {
|
||
for (const s of document.querySelectorAll('picture source[srcset], source[srcset]')) {
|
||
if (imgUrls.length >= 10) break;
|
||
const parts = (s.getAttribute('srcset') || '').split(',')
|
||
.map(p => p.trim().split(/\s+/)[0]).filter(Boolean);
|
||
addUrl(parts[parts.length - 1] || '');
|
||
}
|
||
} catch(e) {}
|
||
}
|
||
|
||
return imgUrls.slice(0, 10);
|
||
}"""
|
||
|
||
|
||
# ── JS_APOLLO_WAIT — Apollo cache readiness check (passed to wait_for_function)
|
||
JS_APOLLO_WAIT = """(lotId) => {
|
||
try {
|
||
if (!window.__APOLLO_CLIENT__) return false;
|
||
const c = window.__APOLLO_CLIENT__.cache.extract();
|
||
if (lotId) {
|
||
for (const prefix of ['Lot','Item','AuctionLot','Product']) {
|
||
const entry = c[prefix + ':' + lotId];
|
||
if (entry && Array.isArray(entry.pictures) && entry.pictures.length > 0)
|
||
return true;
|
||
}
|
||
return false;
|
||
}
|
||
return Object.keys(c).some(k => /^(Lot|Item|AuctionLot|Product):\\d/.test(k));
|
||
} catch(e) { return false; }
|
||
}"""
|
||
|
||
|
||
async def _fetch_listing_images_batch(page_context, new_links: list, db) -> int:
|
||
"""
|
||
Visit each new listing's detail page to extract full image gallery.
|
||
Called immediately after initial scrape so images are available right away
|
||
instead of waiting for the 5-minute price refresh.
|
||
|
||
Args:
|
||
page_context: Playwright BrowserContext (from page.context)
|
||
new_links: list of (listing_id, link) tuples
|
||
db: SQLAlchemy session
|
||
|
||
Returns:
|
||
Number of listings whose images were updated.
|
||
"""
|
||
updated = 0
|
||
for listing_id, link in new_links:
|
||
if not link or link.startswith("no-link"):
|
||
continue
|
||
try:
|
||
dp = await page_context.new_page()
|
||
await dp.route(
|
||
"**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
|
||
lambda route: route.abort(),
|
||
)
|
||
await dp.goto(link, timeout=25_000, wait_until="domcontentloaded")
|
||
|
||
# Smart wait — Apollo cache polling for GraphQL SPAs
|
||
_lot_id_m = re.search(r'/(?:lot|item|product)/(\d+)', link, re.IGNORECASE)
|
||
_lot_id = _lot_id_m.group(1) if _lot_id_m else None
|
||
try:
|
||
await dp.wait_for_function(JS_APOLLO_WAIT, arg=_lot_id,
|
||
timeout=8000, polling=200)
|
||
except Exception:
|
||
pass
|
||
await dp.wait_for_timeout(1200)
|
||
|
||
img_urls = await dp.evaluate(JS_DETAIL_IMAGES)
|
||
await dp.close()
|
||
|
||
# Always save whatever the detail page returned — it is the
|
||
# authoritative source (full-size Apollo URLs, not search thumbnails).
|
||
# Guard: only skip if detail page returned nothing at all (0 images).
|
||
if img_urls:
|
||
listing = db.query(Listing).filter(Listing.id == listing_id).first()
|
||
if listing:
|
||
listing.images = json.dumps(img_urls[:10])
|
||
db.flush()
|
||
db.commit()
|
||
updated += 1
|
||
print(f"[Images] 🖼️ {listing.title[:35]} → {len(img_urls)} image(s)")
|
||
except Exception as exc:
|
||
print(f"[Images] ⚠️ {link[:55]}: {exc}")
|
||
try:
|
||
await dp.close()
|
||
except Exception:
|
||
pass
|
||
continue
|
||
return updated
|
||
|
||
|
||
async def scrape_site(
|
||
page,
|
||
site: TargetSite,
|
||
keyword: Keyword,
|
||
db: Session,
|
||
delay_post_search: int = 0,
|
||
delay_page_hold: int = 0,
|
||
delay_site_open: int = 0,
|
||
is_first_keyword: bool = False,
|
||
humanize_level: str = "heavy", # raw | low | medium | heavy
|
||
) -> int:
|
||
"""
|
||
Navigate to a target site and scrape results for one keyword.
|
||
|
||
Navigation mode is determined automatically by whether url_template
|
||
contains the literal string '{keyword}':
|
||
|
||
Mode A — Direct Template ({keyword} present)
|
||
The URL is built by substituting {keyword} and navigated to directly.
|
||
This is the fast path for sites with stable search-result URLs
|
||
(eBay, Amazon, etc.). If a search_selector is also provided it is
|
||
used only as a fallback when the direct navigation fails.
|
||
|
||
Mode B — Homepage Search (no {keyword} in url_template)
|
||
The bot navigates to url_template as a landing page, then runs
|
||
Search Discovery to locate the search input automatically using
|
||
ARIA roles, placeholder text, and label associations before
|
||
falling back to search_selector if one is stored.
|
||
search_selector is optional — leave it blank and Ghost Node will
|
||
auto-discover the search box on any 2026-era website.
|
||
|
||
Returns the count of qualifying new listings written to the DB.
|
||
"""
|
||
new_count = 0
|
||
_new_listing_links = [] # (listing_id, link) — for detail-page image fetch
|
||
is_direct_mode = "{keyword}" in site.url_template
|
||
|
||
# ── N14: Login guard — check BEFORE any navigation ───────────────────────
|
||
# If the site requires login and the feature is enabled, verify the
|
||
# browser session is authenticated. If not logged in, skip this site
|
||
# and Telegram-alert the user to use the 🔑 Login button in the dashboard.
|
||
if site.requires_login and site.login_enabled:
|
||
already_logged_in = await _check_login_status(page, site)
|
||
if not already_logged_in:
|
||
msg = (
|
||
f"🔑 <b>Login required — {site.name}</b>\n"
|
||
f"Ghost Node is not logged into this site.\n"
|
||
f"Open the Dashboard → Sites tab and press the 🔑 <b>Login</b> button "
|
||
f"next to <i>{site.name}</i>, then log in manually.\n"
|
||
f"Scraping this site is paused until you log in."
|
||
)
|
||
asyncio.create_task(send_alert(msg, subject=f"Login required — {site.name}"))
|
||
print(f"[Login] ⛔ {site.name}: not logged in — skipping. Use dashboard 🔑 Login button.")
|
||
return new_count
|
||
|
||
try:
|
||
# ── MODE A: Direct template navigation ───────────────────────────────
|
||
if is_direct_mode:
|
||
target_url = site.url_template.replace(
|
||
"{keyword}", keyword.term.replace(" ", "+")
|
||
)
|
||
print(
|
||
f"[Scraper] MODE=DIRECT {site.name} | "
|
||
f"'{keyword.term}' → {target_url}"
|
||
)
|
||
|
||
# ── Homepage pre-visit (medium / heavy only) ─────────────────────
|
||
# Raw/low jump straight to the results URL.
|
||
# Medium/heavy visit the homepage first for a natural referer chain.
|
||
if is_first_keyword and humanize_level.strip().lower() in ("medium", "heavy"):
|
||
try:
|
||
# Derive homepage from the template URL
|
||
from urllib.parse import urlparse as _uparse
|
||
_parsed = _uparse(site.url_template)
|
||
_homepage = f"{_parsed.scheme}://{_parsed.netloc}/"
|
||
print(f"[Scraper] 🏠 Pre-visiting homepage: {_homepage}")
|
||
await page.goto(_homepage, timeout=30_000,
|
||
wait_until="domcontentloaded",
|
||
referer="https://www.google.com/")
|
||
# Human idle on homepage — as if glancing at the front page
|
||
await asyncio.sleep(_jitter(2.5, pct=0.5))
|
||
await _human_mouse(page)
|
||
await asyncio.sleep(_jitter(1.2, pct=0.4))
|
||
await _human_scroll(page, steps=random.randint(1, 3))
|
||
await asyncio.sleep(_jitter(1.0, pct=0.4))
|
||
except Exception as hp_exc:
|
||
print(f"[Scraper] ⚠️ Homepage pre-visit skipped: {hp_exc}")
|
||
|
||
try:
|
||
await page.goto(target_url, timeout=60_000, wait_until="networkidle",
|
||
referer=f"https://{_parsed.netloc}/" if is_first_keyword else "https://www.google.com/")
|
||
except Exception as goto_exc:
|
||
print(
|
||
f"[Scraper] ⚠️ networkidle timeout for {site.name}, "
|
||
f"retrying with domcontentloaded: {goto_exc}"
|
||
)
|
||
await page.goto(
|
||
target_url, timeout=60_000, wait_until="domcontentloaded"
|
||
)
|
||
|
||
# ── N2: CAPTCHA check after navigation ────────────────────────────
|
||
if await _detect_captcha(page):
|
||
print(f"[CAPTCHA] 🤖 Detected on {site.name} — attempting solve…")
|
||
_solver = _get_config("captcha_solver", "none").strip().lower()
|
||
_api_key = _get_config("captcha_api_key", "").strip()
|
||
_solved = False
|
||
if _solver == "2captcha" and _api_key:
|
||
_solved = await _solve_captcha_2captcha(page, _api_key)
|
||
elif _solver == "capsolver" and _api_key:
|
||
_solved = await _solve_captcha_capsolver(page, _api_key)
|
||
if not _solved:
|
||
print(f"[CAPTCHA] ❌ Could not solve CAPTCHA on {site.name} — skipping.")
|
||
_record_site_error(site.id, "CAPTCHA not solved")
|
||
return new_count
|
||
|
||
# ── N3: Block detection after navigation ──────────────────────────
|
||
if await _detect_block(page):
|
||
print(f"[Block] 🚫 {site.name} appears to be blocking us.")
|
||
_record_site_error(site.id, "Block/rate-limit detected")
|
||
return new_count
|
||
# ── Website-launch delay (Mode A, first keyword only) ────────────
|
||
# Page has just opened. Delay fires here — before any scraping.
|
||
# Subsequent keywords on the same site skip this (is_first_keyword=False).
|
||
if is_first_keyword and delay_site_open > 0:
|
||
print(
|
||
f"[Scraper] 🌐 Website-launch delay: {delay_site_open}s "
|
||
f"— {site.name} opened, holding before scraping "
|
||
f"({keyword.term})"
|
||
)
|
||
await asyncio.sleep(delay_site_open)
|
||
print(f"[Scraper] ✅ Website-launch delay done")
|
||
|
||
# ── MODE B: Homepage search interaction ──────────────────────────────
|
||
else:
|
||
# search_selector is optional — _discover_search_input will try
|
||
# four semantic strategies (ARIA role, searchbox, placeholder, label)
|
||
# before falling back to the CSS selector. An empty selector simply
|
||
# means all four semantic strategies run with no CSS safety net;
|
||
# if they all fail, _discover_search_input raises RuntimeError and
|
||
# the except below logs it clearly.
|
||
sel = (site.search_selector or "").strip()
|
||
|
||
print(
|
||
f"[Scraper] MODE=HOMEPAGE {site.name} | "
|
||
f"'{keyword.term}' via {site.url_template}"
|
||
+ (f" selector='{sel}'" if sel else " (auto-discover search box)")
|
||
)
|
||
|
||
# Step 1 — land on the homepage
|
||
await page.goto(
|
||
site.url_template, timeout=60_000, wait_until="domcontentloaded"
|
||
)
|
||
# ── Website-launch delay (Mode B, first keyword only) ────────────
|
||
# Homepage has just opened — delay fires here, before the search
|
||
# box is touched. Subsequent keywords skip this entirely.
|
||
if is_first_keyword and delay_site_open > 0:
|
||
print(
|
||
f"[Scraper] 🌐 Website-launch delay: {delay_site_open}s "
|
||
f"— {site.name} homepage opened, holding before search"
|
||
)
|
||
await asyncio.sleep(delay_site_open)
|
||
print(f"[Scraper] ✅ Website-launch delay done")
|
||
|
||
# Step 2 — Search Discovery: semantic locators → CSS fallback
|
||
try:
|
||
search_el = await _discover_search_input(page, sel, site.name)
|
||
|
||
# ── Robust input — works minimised / in background ────────────
|
||
# Uses only Playwright Locator methods which are JS-driven
|
||
# internally. No OS window focus is ever needed.
|
||
#
|
||
# Why NOT page.evaluate() + element_handle():
|
||
# _discover_search_input returns a Locator. Calling
|
||
# locator.element_handle() returns None when the element is
|
||
# not uniquely resolved at that instant (e.g. during a
|
||
# re-navigation) — passing None into page.evaluate() means
|
||
# `el` is null inside the JS, so el.focus() throws
|
||
# "TypeError: el.focus is not a function".
|
||
#
|
||
# Locator.focus() / .fill() / .press() resolve the element
|
||
# fresh on every call, retry automatically on transient
|
||
# detachment, and inject their actions via CDP (Chrome
|
||
# DevTools Protocol) — no OS keyboard or mouse events.
|
||
|
||
# 1. Focus via Locator — CDP-driven, no OS focus needed
|
||
await search_el.focus()
|
||
await asyncio.sleep(_jitter(0.4, pct=0.4))
|
||
|
||
# 2. Type search term — mode depends on humanize_level
|
||
await search_el.fill("") # clear first
|
||
_hlvl_type = humanize_level.strip().lower()
|
||
|
||
if _hlvl_type == "raw":
|
||
# Raw: instant fill — no timing simulation at all
|
||
await search_el.fill(keyword.term)
|
||
|
||
elif _hlvl_type == "low":
|
||
# Low: fill in one shot, small pre/post pause
|
||
await asyncio.sleep(random.uniform(0.1, 0.3))
|
||
await search_el.fill(keyword.term)
|
||
await asyncio.sleep(random.uniform(0.1, 0.3))
|
||
|
||
elif _hlvl_type == "medium":
|
||
# Medium: char-by-char typing, variable WPM, no typos
|
||
await asyncio.sleep(_jitter(0.3, pct=0.4))
|
||
for char in keyword.term:
|
||
await search_el.press(char)
|
||
if char == " ":
|
||
await asyncio.sleep(random.uniform(0.10, 0.25))
|
||
else:
|
||
await asyncio.sleep(random.uniform(0.05, 0.10))
|
||
await asyncio.sleep(_jitter(0.4, pct=0.4))
|
||
|
||
else: # heavy
|
||
# Heavy: char-by-char, variable WPM, 12% typo+backspace,
|
||
# word boundary rhythm, pre-submit re-read pause
|
||
await asyncio.sleep(_jitter(0.5, pct=0.5))
|
||
typo_chars = "qwertyuiopasdfghjklzxcvbnm"
|
||
for char in keyword.term:
|
||
if random.random() < 0.12 and len(keyword.term) > 3:
|
||
wrong = random.choice(typo_chars)
|
||
await search_el.press(wrong)
|
||
await asyncio.sleep(random.uniform(0.08, 0.18))
|
||
await search_el.press("Backspace")
|
||
await asyncio.sleep(random.uniform(0.05, 0.15))
|
||
await search_el.press(char)
|
||
if char == " ":
|
||
await asyncio.sleep(random.uniform(0.12, 0.35))
|
||
else:
|
||
await asyncio.sleep(random.uniform(0.045, 0.110))
|
||
# Pre-submit pause — user re-reads what they typed
|
||
await asyncio.sleep(_jitter(0.6, pct=0.5))
|
||
|
||
# 3. Dispatch an explicit 'input' event as belt-and-braces
|
||
await search_el.dispatch_event("input")
|
||
|
||
# 4. Locator.press() sends Enter via CDP
|
||
await search_el.press("Enter")
|
||
|
||
# Step 3 — wait for results page to settle
|
||
await page.wait_for_load_state("networkidle", timeout=60_000)
|
||
|
||
except Exception as sel_exc:
|
||
print(
|
||
f"[Scraper] ❌ {site.name}: Search Discovery failed — "
|
||
f"{sel_exc}"
|
||
)
|
||
return 0 # bail — don't scrape the homepage itself
|
||
|
||
# ── N17: Try AI-generated selectors first ────────────────────────────
|
||
# If this site has been auto-adapted (SiteSelectors row with confidence
|
||
# >= 50 and not marked stale), use those precise selectors to extract
|
||
# directly. On success, jump straight to the hold/loop logic.
|
||
# On failure (0 results), mark stale and fall through to JS_EXTRACT.
|
||
_ai_adapted_rows: list[dict] = []
|
||
_ss_row = None
|
||
try:
|
||
_ss_db = SessionLocal()
|
||
_ss_row = _ss_db.query(SiteSelectors).filter(
|
||
SiteSelectors.site_id == site.id,
|
||
SiteSelectors.confidence >= 50,
|
||
SiteSelectors.stale == False, # noqa: E712
|
||
).first()
|
||
_ss_db.close()
|
||
except Exception:
|
||
_ss_row = None
|
||
|
||
if _ss_row:
|
||
print(f"[AutoAdapt] ⚡ {site.name}: using AI selectors "
|
||
f"(conf={_ss_row.confidence}, container='{_ss_row.container_sel}')")
|
||
_ai_adapted_rows = await _extract_with_selectors(page, _ss_row)
|
||
if _ai_adapted_rows:
|
||
print(f"[AutoAdapt] ✅ {site.name}: {len(_ai_adapted_rows)} rows via AI selectors")
|
||
# Update last_tested_at
|
||
try:
|
||
_upd_db = SessionLocal()
|
||
_upd_ss = _upd_db.query(SiteSelectors).filter(SiteSelectors.site_id == site.id).first()
|
||
if _upd_ss:
|
||
_upd_ss.last_tested_at = datetime.now()
|
||
_upd_db.flush()
|
||
_upd_db.commit()
|
||
_upd_db.close()
|
||
except Exception:
|
||
pass
|
||
else:
|
||
# AI selectors returned nothing — mark stale, fall through to JS_EXTRACT
|
||
print(f"[AutoAdapt] ⚠️ {site.name}: AI selectors returned 0 rows — marking stale")
|
||
try:
|
||
_stale_db = SessionLocal()
|
||
_stale_ss = _stale_db.query(SiteSelectors).filter(SiteSelectors.site_id == site.id).first()
|
||
if _stale_ss:
|
||
_stale_ss.stale = True
|
||
_stale_db.flush()
|
||
_stale_db.commit()
|
||
_stale_db.close()
|
||
except Exception:
|
||
pass
|
||
# Auto-heal: if Auto-Adapter is enabled, queue a re-adapt in background
|
||
if _get_config("auto_adapt_enabled", "false").lower() == "true":
|
||
print(f"[AutoAdapt] 🔄 Queuing background re-adapt for {site.name}…")
|
||
asyncio.create_task(adapt_site_now(site.id))
|
||
|
||
# ── Collect listing elements ─────────────────────────────────────────
|
||
# These selectors are tried in order; the first one that returns
|
||
# results wins. ShopGoodwill items match div.lot-card / .item-card.
|
||
listing_selectors = [
|
||
"li.s-item", # eBay
|
||
".item-cell",
|
||
"article.product-pod",
|
||
"div.lot-card", # ShopGoodwill
|
||
".item-card", # ShopGoodwill alternate
|
||
"div.listing-item",
|
||
"[data-listing-id]",
|
||
"div[class*='result']",
|
||
"li[class*='product']",
|
||
]
|
||
|
||
items = []
|
||
for sel_try in listing_selectors:
|
||
items = await page.query_selector_all(sel_try)
|
||
if items:
|
||
print(
|
||
f"[Scraper] {site.name}: matched {len(items)} items "
|
||
f"via '{sel_try}'"
|
||
)
|
||
break
|
||
|
||
if not items:
|
||
# Last-resort: any anchor whose href looks like a product page
|
||
items = await page.query_selector_all(
|
||
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
|
||
"a[href*='/product/'], a[href*='/lot/']"
|
||
)
|
||
if items:
|
||
print(
|
||
f"[Scraper] {site.name}: fallback anchor match "
|
||
f"({len(items)} items)"
|
||
)
|
||
|
||
if not items:
|
||
print(f"[Scraper] ⚠️ {site.name}: no listing elements found on page.")
|
||
|
||
# ── Delay 2: post-search pause before scraping ────────────────────
|
||
if delay_post_search > 0:
|
||
print(
|
||
f"[Scraper] ⏳ Post-search delay: {delay_post_search}s "
|
||
f"— waiting {delay_post_search}s before scraping "
|
||
f"({site.name} | '{keyword.term}')"
|
||
)
|
||
await asyncio.sleep(delay_post_search)
|
||
print(f"[Scraper] ✅ Post-search delay done — starting scrape")
|
||
|
||
# ── Human simulation — level-gated ──────────────────────────────────
|
||
# raw: no simulation at all — bare requests, fastest, least safe
|
||
# low: one quick mouse move + one scroll pass
|
||
# medium: mouse + scroll + post-scroll idle
|
||
# heavy: full 5-step sequence with long idles and re-read behaviour
|
||
_hlvl = humanize_level.strip().lower()
|
||
|
||
if _hlvl == "raw":
|
||
pass # no simulation whatsoever
|
||
|
||
elif _hlvl == "low":
|
||
await _human_mouse(page)
|
||
await asyncio.sleep(_jitter(0.4, pct=0.4))
|
||
await _human_scroll(page, steps=random.randint(1, 2))
|
||
await asyncio.sleep(_jitter(0.4, pct=0.4))
|
||
|
||
elif _hlvl == "medium":
|
||
await asyncio.sleep(_jitter(0.7, pct=0.4)) # brief page-load idle
|
||
await _human_mouse(page)
|
||
await asyncio.sleep(_jitter(0.5, pct=0.4))
|
||
await _human_scroll(page, steps=random.randint(2, 4))
|
||
await asyncio.sleep(_jitter(0.8, pct=0.4)) # post-scroll idle
|
||
|
||
else: # heavy (default)
|
||
# Full 5-step sequence:
|
||
# 1. Brief idle — page just loaded, user orients themselves
|
||
# 2. Mouse moves toward content area
|
||
# 3. Scroll through results with read-rhythm pauses
|
||
# 4. More hover-reading
|
||
# 5. "Thinking" pause before acting
|
||
await asyncio.sleep(_jitter(1.2, pct=0.5)) # page-load idle
|
||
await _human_mouse(page) # initial cursor
|
||
await asyncio.sleep(_jitter(0.8, pct=0.5))
|
||
await _human_scroll(page, steps=random.randint(4, 7)) # read results
|
||
await asyncio.sleep(_jitter(0.9, pct=0.5))
|
||
await _human_mouse(page) # hover-reading
|
||
await asyncio.sleep(_jitter(1.5, pct=0.6)) # thinking pause
|
||
|
||
# ── Delay 3: page-hold re-scrape loop ───────────────────────────────
|
||
# Holds the results page for exactly `delay_page_hold` seconds total.
|
||
# The scraper runs a full pass, and as soon as that pass completes it
|
||
# checks if time remains — if yes it immediately starts the next pass
|
||
# with no idle wait between passes. The loop only ends when the hold
|
||
# timer has fully expired OR on the first pass when no hold is set.
|
||
# The DB unique-link constraint deduplicates across all passes —
|
||
# each listing URL is written exactly once, no clones ever saved.
|
||
_hold_deadline = time.time() + (delay_page_hold if delay_page_hold > 0 else 0)
|
||
_pass_num = 0
|
||
|
||
while True:
|
||
_pass_num += 1
|
||
_pass_new = 0
|
||
_pass_start = time.time()
|
||
|
||
# Always re-query the DOM on every pass so any items that loaded
|
||
# after the initial page-settle are captured.
|
||
items_current = []
|
||
for sel_try in listing_selectors:
|
||
items_current = await page.query_selector_all(sel_try)
|
||
if items_current:
|
||
break
|
||
if not items_current:
|
||
items_current = await page.query_selector_all(
|
||
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
|
||
"a[href*='/product/'], a[href*='/lot/']"
|
||
)
|
||
items_current = items_current[:30]
|
||
|
||
# Log every pass — show remaining hold time
|
||
if delay_page_hold > 0:
|
||
_remaining = max(0, int(_hold_deadline - time.time()))
|
||
print(
|
||
f"[Scraper] 🔁 Page-hold pass #{_pass_num} — "
|
||
f"{_remaining}s / {delay_page_hold}s remaining "
|
||
f"({site.name} | '{keyword.term}')"
|
||
)
|
||
|
||
# ── Page-level parallel extraction ────────────────────────────────
|
||
# If AI selectors produced rows, use them directly (skip JS_EXTRACT).
|
||
# Otherwise run the universal JS extractor as before.
|
||
if _ai_adapted_rows:
|
||
page_data = _ai_adapted_rows
|
||
else:
|
||
page_data = await page.evaluate(JS_EXTRACT)
|
||
|
||
from urllib.parse import urljoin
|
||
for row in (page_data or []):
|
||
try:
|
||
title = row.get("title", "").strip()
|
||
price_text = row.get("price_text", "").strip()
|
||
time_text = row.get("time_text", "").strip()
|
||
location = row.get("location", "").strip()
|
||
href = row.get("href", "").strip()
|
||
images_list = row.get("images", [])
|
||
|
||
if not title or len(title) < 5:
|
||
continue
|
||
if href and not href.startswith("http"):
|
||
href = urljoin(page.url, href)
|
||
|
||
score = calculate_attribute_score(title, keyword.weight)
|
||
|
||
# ── Extract price and check N7 price filters ───────────────────
|
||
amount, currency = _extract_price_and_currency(price_text)
|
||
# N7: per-keyword price filter
|
||
kw_min = keyword.min_price
|
||
kw_max = keyword.max_price
|
||
if amount is not None:
|
||
if kw_min is not None and amount < kw_min:
|
||
print(f"[N7] ⬇️ Skipping '{title[:40]}' — price {amount} below min {kw_min}")
|
||
continue
|
||
if kw_max is not None and amount > kw_max:
|
||
print(f"[N7] ⬆️ Skipping '{title[:40]}' — price {amount} above max {kw_max}")
|
||
continue
|
||
|
||
# ── N16: AI filter — runs when keyword has an ai_target ───
|
||
_ai_enabled = _get_config("ai_filter_enabled", "false").lower() == "true"
|
||
_scoring_on = _get_config("scoring_enabled", "true").lower() == "true"
|
||
_ai_target = (keyword.ai_target or "").strip()
|
||
_ai_match_val = None
|
||
_ai_reason_val = None
|
||
|
||
if _ai_enabled and _ai_target:
|
||
# AI is the judge — score is calculated for display only.
|
||
# Score gate is bypassed regardless of scoring_enabled setting.
|
||
_ai_match_val, _ai_reason_val = await _ai_analyze(title, _ai_target)
|
||
if not _ai_match_val:
|
||
# AI rejected — save to DB as rejected (ai_match=0) but don't alert
|
||
_stats["total_scanned"] += 1
|
||
if not (href and db.query(Listing).filter(Listing.link == href).first()):
|
||
amount_rej, currency_rej = _extract_price_and_currency(price_text)
|
||
listing_rej = Listing(
|
||
title=title[:500],
|
||
price=amount_rej,
|
||
currency=currency_rej[:10] if currency_rej else "",
|
||
price_raw=_format_price(amount_rej, currency_rej)[:100],
|
||
time_left=_extract_time_left(time_text)[:60],
|
||
link=href or f"no-link-{random.randint(0,999999)}",
|
||
score=score,
|
||
keyword=keyword.term,
|
||
site_name=site.name,
|
||
location=location or "",
|
||
ai_match=0,
|
||
ai_reason=_ai_reason_val[:200] if _ai_reason_val else None,
|
||
images=json.dumps(images_list[:10]) if images_list else None,
|
||
)
|
||
db.add(listing_rej)
|
||
db.flush()
|
||
db.commit()
|
||
continue
|
||
elif _scoring_on:
|
||
# No AI target — fall back to score gate (only when scoring is enabled)
|
||
if score < 0:
|
||
continue
|
||
# else: scoring disabled AND no AI target → all lots pass through
|
||
|
||
_stats["total_scanned"] += 1
|
||
|
||
if href and db.query(Listing).filter(Listing.link == href).first():
|
||
continue
|
||
|
||
# ── N11: Cross-site deduplication (eBay only) ─────────────────
|
||
# If the same title already exists on a different eBay region
|
||
# within the last 24h, suppress the duplicate listing + alert.
|
||
_is_ebay = "ebay" in site.name.lower() or "ebay" in (site.url_template or "").lower()
|
||
if _is_ebay:
|
||
_cutoff = datetime.now() - timedelta(hours=24)
|
||
_recent_other_ebay = db.query(Listing).filter(
|
||
Listing.timestamp >= _cutoff,
|
||
Listing.site_name != site.name,
|
||
Listing.site_name.ilike("%ebay%"),
|
||
).all()
|
||
_is_cross_dupe = any(
|
||
difflib.SequenceMatcher(None, title.lower(), r.title.lower()).ratio() > 0.85
|
||
for r in _recent_other_ebay
|
||
)
|
||
if _is_cross_dupe:
|
||
print(f"[N11] 🔁 Cross-site duplicate suppressed: '{title[:50]}'")
|
||
continue
|
||
|
||
price_display = _format_price(amount, currency)
|
||
time_left_str = _extract_time_left(time_text)
|
||
|
||
listing = Listing(
|
||
title=title[:500],
|
||
price=amount,
|
||
currency=currency[:10] if currency else "",
|
||
price_raw=price_display[:100],
|
||
time_left=time_left_str[:60],
|
||
time_left_mins=round(timeLeftToMins(time_left_str), 4) if time_left_str and timeLeftToMins(time_left_str) != float('inf') else None,
|
||
price_updated_at=datetime.now(),
|
||
link=href or f"no-link-{random.randint(0,999999)}",
|
||
score=score,
|
||
keyword=keyword.term,
|
||
site_name=site.name,
|
||
location=location or "",
|
||
ai_match=1 if (_ai_enabled and _ai_target) else None,
|
||
ai_reason=_ai_reason_val[:200] if _ai_reason_val else None,
|
||
images=json.dumps(images_list[:10]) if images_list else None,
|
||
)
|
||
db.add(listing)
|
||
db.flush()
|
||
db.commit()
|
||
# N4: store USD price for cross-site sorting
|
||
if listing.price and listing.currency:
|
||
listing.price_usd = _convert_price(listing.price, listing.currency, "USD")
|
||
if listing.price_usd:
|
||
db.flush()
|
||
db.commit()
|
||
new_count += 1
|
||
_pass_new += 1
|
||
_stats["total_alerts"] += 1
|
||
_new_listing_links.append((listing.id, listing.link))
|
||
|
||
# ── Alert (with AI verdict if applicable) ─────────────────
|
||
_ai_line = f"🤖 AI: ✅ {_ai_reason_val}\n" if (_ai_enabled and _ai_target and _ai_reason_val) else ""
|
||
alert = (
|
||
f"🎯 <b>Ghost Node — New Hit</b>\n"
|
||
f"📦 {title[:80]}\n"
|
||
f"💰 {price_display or 'Price unknown'}"
|
||
+ (f" | ⏳ {time_left_str}" if time_left_str else "") + "\n"
|
||
+ f"🏷️ Keyword: <i>{keyword.term}</i> | Score: {score}\n"
|
||
+ _ai_line
|
||
+ f"🌐 Site: {site.name}\n"
|
||
f"🔗 {href[:200]}"
|
||
)
|
||
asyncio.create_task(send_alert(alert, subject=f"Ghost Node — {title[:40]}"))
|
||
|
||
except Exception as row_exc:
|
||
print(f"[Scraper] row parse error: {row_exc}")
|
||
continue
|
||
|
||
# Always log pass summary
|
||
_pass_elapsed = round(time.time() - _pass_start, 1)
|
||
_remaining_after = max(0, int(_hold_deadline - time.time()))
|
||
print(
|
||
f"[Scraper] ✓ Pass #{_pass_num} complete in {_pass_elapsed}s — "
|
||
f"{_pass_new} new | {new_count} total"
|
||
+ (f" | {_remaining_after}s hold remaining — re-scraping now" if delay_page_hold > 0 and _remaining_after > 0 else "")
|
||
)
|
||
|
||
# Exit: no hold timer set, or hold timer has now expired
|
||
if delay_page_hold <= 0 or time.time() >= _hold_deadline:
|
||
break
|
||
# Time remains — start next pass immediately (no idle wait)
|
||
|
||
# ── N5: Pagination — follow "Next page" up to site.max_pages ─────────
|
||
# Runs AFTER the page-hold loop so we only paginate when all passes
|
||
# on the current page are complete. Each new page resets the hold timer.
|
||
_max_pg = max(1, site.max_pages or 1)
|
||
_cur_pg = 1
|
||
while _cur_pg < _max_pg:
|
||
_went = await _go_next_page(page)
|
||
if not _went:
|
||
break
|
||
_cur_pg += 1
|
||
print(f"[Scraper] 📄 {site.name} | '{keyword.term}' → page {_cur_pg}/{_max_pg}")
|
||
|
||
# Brief human pause between pages
|
||
await asyncio.sleep(_jitter(2.5, pct=0.4))
|
||
if _hlvl not in ("raw",):
|
||
await _human_scroll(page, steps=random.randint(2, 4))
|
||
|
||
# Re-extract listing items on the new page
|
||
items_pg = []
|
||
for sel_try in listing_selectors:
|
||
items_pg = await page.query_selector_all(sel_try)
|
||
if items_pg:
|
||
break
|
||
if not items_pg:
|
||
items_pg = await page.query_selector_all(
|
||
"a[href*='itm'], a[href*='listing'], a[href*='item'], "
|
||
"a[href*='/product/'], a[href*='/lot/']"
|
||
)
|
||
|
||
# Run the same page-level extraction on the new page
|
||
try:
|
||
rows_pg = await page.evaluate(JS_EXTRACT)
|
||
except Exception:
|
||
rows_pg = []
|
||
|
||
for row in (rows_pg or [])[:50]:
|
||
try:
|
||
title = (row.get("title") or "").strip()
|
||
price_text = (row.get("price_text") or "").strip()
|
||
time_text = (row.get("time_text") or "").strip()
|
||
href = (row.get("href") or "").strip()
|
||
images_list2 = row.get("images", [])
|
||
if not title or len(title) < 5:
|
||
continue
|
||
if href and not href.startswith("http"):
|
||
from urllib.parse import urljoin
|
||
href = urljoin(page.url, href)
|
||
score = calculate_attribute_score(title, keyword.weight)
|
||
|
||
# ── N16: AI filter (pagination) ───────────────────────────
|
||
_ai_en2 = _get_config("ai_filter_enabled", "false").lower() == "true"
|
||
_ai_tgt2 = (keyword.ai_target or "").strip()
|
||
_ai_match2 = None
|
||
_ai_reason2 = None
|
||
|
||
if _ai_en2 and _ai_tgt2:
|
||
_ai_match2, _ai_reason2 = await _ai_analyze(title, _ai_tgt2)
|
||
if not _ai_match2:
|
||
_stats["total_scanned"] += 1
|
||
if not (href and db.query(Listing).filter(Listing.link == href).first()):
|
||
_amt_r, _cur_r = _extract_price_and_currency(price_text)
|
||
db.add(Listing(
|
||
title=title[:500], price=_amt_r,
|
||
currency=_cur_r[:10] if _cur_r else "",
|
||
price_raw=_format_price(_amt_r, _cur_r)[:100],
|
||
time_left=_extract_time_left(time_text)[:60],
|
||
link=href or f"no-link-pg{_cur_pg}-{random.randint(0,999999)}",
|
||
score=score, keyword=keyword.term, site_name=site.name,
|
||
ai_match=0,
|
||
ai_reason=_ai_reason2[:200] if _ai_reason2 else None,
|
||
images=json.dumps(images_list2[:10]) if images_list2 else None,
|
||
))
|
||
db.flush()
|
||
db.commit()
|
||
continue
|
||
else:
|
||
if score < 0:
|
||
continue
|
||
|
||
_stats["total_scanned"] += 1
|
||
if href and db.query(Listing).filter(Listing.link == href).first():
|
||
continue
|
||
amount, currency = _extract_price_and_currency(price_text)
|
||
price_display = _format_price(amount, currency)
|
||
time_left_str = _extract_time_left(time_text)
|
||
listing = Listing(
|
||
title=title[:500], price=amount,
|
||
currency=currency[:10] if currency else "",
|
||
price_raw=price_display[:100],
|
||
time_left=time_left_str[:60],
|
||
time_left_mins=round(timeLeftToMins(time_left_str), 4) if time_left_str and timeLeftToMins(time_left_str) != float("inf") else None,
|
||
price_updated_at=datetime.now(),
|
||
link=href or f"no-link-pg{_cur_pg}-{random.randint(0,999999)}",
|
||
score=score, keyword=keyword.term, site_name=site.name,
|
||
ai_match=1 if (_ai_en2 and _ai_tgt2) else None,
|
||
ai_reason=_ai_reason2[:200] if _ai_reason2 else None,
|
||
images=json.dumps(images_list2[:10]) if images_list2 else None,
|
||
)
|
||
db.add(listing)
|
||
db.flush()
|
||
db.commit()
|
||
new_count += 1
|
||
_stats["total_alerts"] += 1
|
||
_new_listing_links.append((listing.id, listing.link))
|
||
_tl_pg = f" | ⏳ {time_left_str}" if time_left_str else ""
|
||
_ai_line2 = f"🤖 AI: ✅ {_ai_reason2}\n" if (_ai_en2 and _ai_tgt2 and _ai_reason2) else ""
|
||
alert = (
|
||
"🎯 <b>Ghost Node — New Hit</b>"
|
||
f" (p{_cur_pg})\n"
|
||
f"📦 {title[:80]}\n"
|
||
f"💰 {price_display or 'Price unknown'}{_tl_pg}\n"
|
||
f"🏷️ Keyword: <i>{keyword.term}</i> | Score: {score}\n"
|
||
+ _ai_line2
|
||
+ f"🌐 Site: {site.name}\n"
|
||
f"🔗 {href[:200]}"
|
||
)
|
||
asyncio.create_task(send_alert(alert, subject=f"Ghost Node — {title[:40]}"))
|
||
except Exception:
|
||
continue
|
||
|
||
print(f"[Scraper] ✓ {site.name} | '{keyword.term}' page {_cur_pg} → {new_count} total new")
|
||
except Exception as nav_exc:
|
||
# Baghdad Optimization — single site failure never crashes the engine
|
||
print(f"[Scraper] ⚠️ {site.name} | {keyword.term} → {nav_exc}")
|
||
_record_site_error(site.id, str(nav_exc)[:400])
|
||
return new_count
|
||
|
||
# ── Immediate detail-page image fetch for new listings ──────────────────
|
||
# Search results pages only have thumbnails. Visit each new lot's detail
|
||
# page NOW (same browser context) to grab all images immediately instead
|
||
# of waiting for the 5-minute price refresh pass.
|
||
if _new_listing_links:
|
||
print(f"[Images] 🖼️ Fetching detail images for {len(_new_listing_links)} new listing(s)…")
|
||
try:
|
||
_img_updated = await _fetch_listing_images_batch(
|
||
page.context, _new_listing_links, db
|
||
)
|
||
print(f"[Images] ✅ {_img_updated}/{len(_new_listing_links)} listings got full images")
|
||
except Exception as img_exc:
|
||
print(f"[Images] ⚠️ Batch image fetch failed: {img_exc}")
|
||
|
||
# ── N13: Record success after clean completion ─────────────────────────────
|
||
_record_site_success(site.id)
|
||
return new_count
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# N9 — Closing-Soon Alert Loop (Thread E)
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
async def closing_alert_loop() -> None:
    """
    Multi-interval closing alert loop.
    Config: closing_alert_enabled (true/false)
            closing_alert_schedule (comma-separated minutes, e.g. "60,30,10,5")
            Use "0" to disable all closing alerts while keeping capture alerts.
    Each lot can fire multiple alerts — one per configured threshold.
    Tracks fired intervals in Listing.closing_alerts_sent (JSON list).

    Runs forever on Thread E's dedicated event loop; one full pass over all
    candidate lots every 60 seconds. Any exception inside a pass is caught,
    logged, and the loop continues on the next tick.
    """
    print("[Thread E] Closing-alert loop online.")
    while True:
        try:
            enabled = _get_config("closing_alert_enabled", "false").lower() == "true"
            if enabled:
                schedule_raw = _get_config("closing_alert_schedule", "30").strip()
                # Parse thresholds — "0" means no closing alerts at all.
                # Malformed entries are silently dropped (ValueError swallowed).
                thresholds: list[float] = []
                for t in schedule_raw.split(","):
                    t = t.strip()
                    if t and t != "0":
                        try:
                            thresholds.append(float(t))
                        except ValueError:
                            pass
                # Dedupe and sort descending so the largest (earliest) threshold
                # is considered first for each lot.
                thresholds = sorted(set(thresholds), reverse=True)  # e.g. [60, 30, 10, 5]

                if thresholds:
                    db = SessionLocal()
                    try:
                        from datetime import timedelta
                        # Ignore lots first seen more than 7 days ago.
                        stale_cutoff = datetime.now() - timedelta(days=7)
                        # NOTE(review): max_threshold is computed but never used
                        # below — looks like leftover from an earlier filter.
                        max_threshold = max(thresholds)
                        # `!= None` (not `is not None`) is intentional — SQLAlchemy
                        # column comparison, not a Python identity check.
                        candidates = (
                            db.query(Listing)
                            .filter(
                                Listing.time_left_mins != None,
                                Listing.time_left_mins > 0,
                                Listing.timestamp >= stale_cutoff,
                            )
                            .all()
                        )
                        for lot in candidates:
                            if not lot.timestamp:
                                continue
                            # time_left_mins was captured at ref_time; subtract the
                            # wall-clock time elapsed since then to estimate the
                            # true remaining minutes now.
                            ref_time = lot.price_updated_at or lot.timestamp
                            elapsed_mins = (datetime.now() - ref_time).total_seconds() / 60.0
                            remaining = (lot.time_left_mins or 0) - elapsed_mins

                            if remaining <= 0:
                                # Mark all thresholds as sent to avoid re-processing
                                lot.closing_alerts_sent = json.dumps(thresholds)
                                db.flush()
                                db.commit()
                                continue

                            # Load which intervals have already fired
                            try:
                                fired: list = json.loads(lot.closing_alerts_sent or "[]")
                            except Exception:
                                fired = []

                            for threshold in thresholds:
                                if threshold in fired:
                                    continue  # already sent for this interval
                                if remaining <= threshold:
                                    # Human-readable countdown: "Xm YYs" under an
                                    # hour, otherwise "Xh Ym".
                                    mins_int = int(remaining)
                                    secs_int = int((remaining - mins_int) * 60)
                                    time_str = (
                                        f"{mins_int}m {secs_int:02d}s"
                                        if mins_int < 60
                                        else f"{int(remaining/60)}h {mins_int % 60}m"
                                    )
                                    alert = (
                                        f"⏰ <b>CLOSING SOON — {time_str} left!</b>\n"
                                        f"📦 {lot.title[:80]}\n"
                                        f"💰 {lot.price_raw or 'Price unknown'}\n"
                                        f"🏷️ Keyword: <i>{lot.keyword}</i> | Score: {lot.score}\n"
                                        f"🌐 Site: {lot.site_name}\n"
                                        f"🔗 {lot.link[:200]}"
                                    )
                                    # Fire-and-forget — alert delivery never blocks the pass.
                                    asyncio.create_task(send_alert(alert, subject=f"CLOSING SOON — {lot.title[:40]}"))
                                    # Persist immediately per alert so a crash
                                    # mid-pass never re-sends a fired threshold.
                                    fired.append(threshold)
                                    lot.closing_alerts_sent = json.dumps(fired)
                                    db.flush()
                                    db.commit()
                                    print(f"[Thread E] ⏰ @{threshold}min alert: {lot.title[:50]} ({time_str} left)")
                    finally:
                        db.close()
        except Exception as exc:
            # Never let a single bad pass kill Thread E.
            print(f"[Thread E] Error: {exc}")
        # Fixed 60-second cadence between passes (enabled or not).
        await asyncio.sleep(60)
|
||
|
||
|
||
def run_closing_alert_thread() -> None:
    """Thread-E entry point: run closing_alert_loop() on its own event loop.

    Each worker thread owns a dedicated asyncio loop so Thread E never
    competes with the scraper (Thread B) or the API for loop time.
    """
    dedicated_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(dedicated_loop)
    dedicated_loop.run_until_complete(closing_alert_loop())
|
||
|
||
|
||
async def nuclear_engine() -> None:
    """
    Main scraper loop — runs forever.
    Pulls a FRESH copy of TargetSites + Config from the DB at the TOP of
    every cycle, so any site/keyword added via the UI is immediately active.

    Per cycle: honour pause/scrape-window gates, reload config, then for each
    enabled site launch a fresh Chromium context (incognito or persistent),
    run every keyword through scrape_site(), tear the browser down, and sleep
    a jittered interval (shortened to boost_interval_mins when any tracked
    lot closes within 30 minutes). A per-site try/except guarantees a single
    browser failure never kills the engine.
    """
    print("[Thread B] Nuclear engine igniting…")
    _stats["engine_status"] = "Running"

    async with async_playwright() as pw:
        while True:
            # Operator pause — poll every 10s until resumed via the API.
            if _stats["engine_status"] == "Paused":
                await asyncio.sleep(10)
                continue

            # ── N8: Scrape-window check — skip cycle outside allowed hours ────
            _win_enabled = _get_config("scrape_window_enabled", "false").lower() == "true"
            if _win_enabled:
                _now_hour = datetime.now().hour
                _start_h = int(_get_config("scrape_start_hour", "8"))
                _end_h = int(_get_config("scrape_end_hour", "22"))
                # Handles same-day windows (08:00–22:00) and overnight windows (22:00–06:00)
                if _start_h <= _end_h:
                    _in_window = _start_h <= _now_hour < _end_h
                else:  # overnight window e.g. 22–06
                    _in_window = _now_hour >= _start_h or _now_hour < _end_h
                if not _in_window:
                    print(
                        f"[Thread B] 🌙 Outside scrape window ({_start_h:02d}:00–{_end_h:02d}:00) "
                        f"— current hour {_now_hour:02d}:xx. Sleeping 5min."
                    )
                    await asyncio.sleep(300)
                    continue

            # ── Pull live config from DB — fresh session every cycle ──────────
            db = SessionLocal()
            try:
                keywords = db.query(Keyword).all()
                # Only rows where enabled is explicitly 1.
                # We also audit and log any disabled rows so the operator
                # can confirm that toggling a site off actually prevents
                # the engine from touching it.
                target_sites = db.query(TargetSite).filter(TargetSite.enabled == 1).all()
                disabled_sites = db.query(TargetSite).filter(TargetSite.enabled != 1).all()
                timer_val = int(_get_config("timer", "120"))
                delay_launch = int(_get_config("delay_launch", "0"))
                delay_site_open = int(_get_config("delay_site_open", "0"))
                delay_search = int(_get_config("delay_post_search", "0"))
                delay_hold = int(_get_config("delay_page_hold", "0"))
                # Humanize level gates how much fake mouse/scroll activity
                # scrape_site performs; unknown values fall back to "heavy".
                humanize_level = _get_config("humanize_level", "heavy").strip().lower()
                if humanize_level not in ("raw", "low", "medium", "heavy"):
                    humanize_level = "heavy"
                print(f"[Thread B] 🎭 Humanize level: {humanize_level.upper()}")
            finally:
                db.close()

            # N4: Refresh FX rates once per cycle
            await _get_fx_rates()

            # ── Log exactly which sites the engine will scrape this cycle ─────
            if disabled_sites:
                skipped = ", ".join(f"'{s.name}'" for s in disabled_sites)
                print(f"[Thread B] ⏭️ Skipping disabled site(s): {skipped}")

            if target_sites:
                site_names = ", ".join(f"'{s.name}'" for s in target_sites)
                print(f"[Thread B] 🔄 Cycle starting — {len(target_sites)} site(s): {site_names}")
            else:
                print("[Thread B] ⚠️ No enabled TargetSites in DB — sleeping 60s.")
                await asyncio.sleep(60)
                continue

            if not keywords:
                print("[Thread B] ⚠️ No Keywords in DB — sleeping 60s.")
                await asyncio.sleep(60)
                continue

            _stats["engine_status"] = "Running"
            cycle_start = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            for site in target_sites:
                # ── New browser context per site (stealth rotation) ───────────
                try:
                    # Resolve browser + incognito setting fresh every cycle
                    _browser_label, _browser_exe = _resolve_browser()
                    _incognito = _get_config("incognito_mode", "false").lower() == "true"

                    # ── Incognito ON: ephemeral context — no cookies, no history ──
                    # ── Incognito OFF: persistent user-data-dir — cookies survive ──
                    #
                    # Why this matters: Playwright's browser.new_context() is
                    # always cookie-less (in-memory only) regardless of the
                    # --incognito flag. To give the OFF mode real persistence we
                    # use launch_persistent_context() with a user-data directory
                    # on disk so login sessions and cookies carry over between
                    # scrape cycles, making the bot look like a returning visitor.

                    # ── Read show_browser setting — when True the window is visible ──
                    _show_browser = _get_config("show_browser", "false").lower() == "true"
                    _headless = not _show_browser  # headless=False means visible

                    if _show_browser:
                        print(
                            f"[Browser] 👁️ VISIBLE MODE — browser window will open on screen. "
                            f"Close Ghost Node or toggle off in Settings when done debugging."
                        )

                    _launch_args = [
                        "--no-sandbox",

                        # ── Background-throttling kill switches ───────────────
                        # These flags tell Chromium's internal scheduler and
                        # renderer to treat this browser exactly the same whether
                        # it is the foreground window, minimised, or behind other
                        # windows. No OS focus is ever needed.
                        #
                        # Without these flags Chromium intentionally slows down
                        # background tabs: JS timers fire at 1 Hz instead of
                        # normal rate, GPU compositing pauses, and wake-locks are
                        # dropped — all of which cause silent scraping failures.

                        # Prevents the renderer process from being deprioritised
                        # when the window loses OS focus or is minimised
                        "--disable-renderer-backgrounding",

                        # Prevents background tabs from having their JS timers
                        # throttled to 1-second intervals
                        "--disable-background-timer-throttling",

                        # Prevents Chromium from pausing rendering for windows
                        # that are fully hidden behind other windows
                        "--disable-backgrounding-occluded-windows",

                        # Disables Chromium's own background network activity
                        # (update checks, safe-browsing pings) so the only
                        # traffic from this browser is the scrape itself
                        "--disable-background-networking",

                        # Forces Chromium to keep all tabs at the same priority
                        # regardless of visibility
                        "--force-fieldtrials=BackgroundTabStopping/disable",
                    ]
                    # Pick a consistent agent profile — UA, platform, language,
                    # locale and timezone all match so HTTP headers agree with
                    # navigator properties (detectors cross-check these).
                    _profile = random.choice(_agent_profiles)
                    _vw, _vh = _profile["viewport"]
                    # NOTE(review): _launch_kwargs_base is only consumed by the
                    # persistent (non-incognito) branch below; the incognito
                    # branch re-specifies its options inline.
                    _launch_kwargs_base: dict = {
                        "headless": _headless,
                        "args": _launch_args,
                        "user_agent": _profile["ua"],
                        "viewport": {"width": _vw, "height": _vh},
                        "locale": _profile["locale"],
                        "timezone_id": _profile["tz"],
                        "extra_http_headers": {
                            "Accept-Language": _profile["lang"],
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
                        },
                    }
                    if _browser_exe:
                        _launch_kwargs_base["executable_path"] = _browser_exe
                    # N1: Add proxy if enabled
                    _proxy = _get_proxy()
                    if _proxy:
                        _launch_kwargs_base["proxy"] = _proxy

                    _visibility_tag = "VISIBLE 👁" if _show_browser else "headless"

                    if _incognito:
                        # Ephemeral: fresh slate every session
                        browser = await pw.chromium.launch(
                            headless=_headless,
                            args=_launch_args,
                            **({"executable_path": _browser_exe} if _browser_exe else {}),
                        )
                        context = await browser.new_context(
                            user_agent=_profile["ua"],
                            viewport={"width": _vw, "height": _vh},
                            locale=_profile["locale"],
                            timezone_id=_profile["tz"],
                        )
                        print(f"[Browser] 🕵️ Launched {_browser_label} — INCOGNITO + {_visibility_tag}")
                    else:
                        # Persistent: per-site profile dirs so cookie jars
                        # don't bleed across different sites.
                        _site_slug = re.sub(r"[^\w]", "_", site.name.lower())[:20]
                        _profile_dir = os.path.join(
                            os.path.dirname(__file__), ".browser_profiles", _site_slug
                        )
                        os.makedirs(_profile_dir, exist_ok=True)
                        context = await pw.chromium.launch_persistent_context(
                            _profile_dir,
                            **_launch_kwargs_base,
                        )
                        browser = None  # persistent context manages its own lifecycle
                        print(f"[Browser] 🚀 Launched {_browser_label} — NORMAL + {_visibility_tag}")

                    # ── 30-property stealth init script ──────────────────────
                    # Built from the selected agent profile so every property
                    # (UA, platform, language, WebGL renderer, canvas noise,
                    # audio noise, screen size, timing) is internally consistent.
                    await context.add_init_script(_build_stealth_script(_profile))

                    page = await context.new_page()
                    # Block heavy resources — but only after the first visit so
                    # the initial page load looks more natural to CDN detectors.
                    # NOTE(review): the route is actually registered immediately,
                    # before any navigation — confirm the comment above matches
                    # the intended behaviour.
                    await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
                                     lambda route: route.abort())

                    # ── Delay 1: post-launch settle time ─────────────────────
                    if delay_launch > 0:
                        print(
                            f"[Browser] ⏳ Post-launch delay: {delay_launch}s "
                            f"— browser open, waiting {delay_launch}s before first navigation"
                        )
                        await asyncio.sleep(delay_launch)
                        print(f"[Browser] ✅ Post-launch delay done — navigating now")

                    # One DB session shared across all keywords for this site.
                    db = SessionLocal()
                    try:
                        for _kw_idx, kw in enumerate(keywords):
                            found = await scrape_site(
                                page, site, kw, db,
                                delay_post_search=delay_search,
                                delay_page_hold=delay_hold,
                                delay_site_open=delay_site_open,
                                is_first_keyword=(_kw_idx == 0),
                                humanize_level=humanize_level,
                            )
                            print(f"[Scraper] ✓ {site.name} | '{kw.term}' → {found} new")
                            # Jitter: 8–20 seconds between keywords (was 5-15)
                            # Randomised more aggressively so inter-keyword timing
                            # has no detectable pattern.
                            jitter = _jitter(random.uniform(8, 20), pct=0.4)
                            await asyncio.sleep(jitter)
                    finally:
                        db.close()

                    # Close the context (both modes); close browser only if not persistent
                    await context.close()
                    if browser is not None:
                        await browser.close()

                except Exception as browser_exc:
                    # Per-site isolation: one bad site never stops the cycle.
                    print(f"[Thread B] Browser error on {site.name}: {browser_exc}")

            _stats["last_cycle"] = cycle_start
            _stats["engine_status"] = "Idle — waiting next cycle"

            # ── N8: Boost mode — shorten interval when a lot closes soon ──────
            # Check if any tracked lot closes within 30 min. If so, use
            # boost_interval_mins instead of the normal timer so the engine
            # refreshes more often during the critical closing window.
            _boost_secs = int(_get_config("boost_interval_mins", "2")) * 60
            _db_boost = SessionLocal()
            try:
                # NOTE(review): _soon_cutoff is computed but never used — the
                # query below filters on time_left_mins directly. Looks like
                # leftover from an earlier timestamp-based filter.
                _soon_cutoff = datetime.now() + timedelta(minutes=30)
                _closing_soon = _db_boost.query(Listing).filter(
                    Listing.time_left_mins != None,
                    Listing.time_left_mins <= 30,
                    Listing.time_left_mins > 0,
                ).count()
            finally:
                _db_boost.close()
            _effective_timer = _boost_secs if _closing_soon else timer_val
            _boost_label = f" [⚡ BOOST MODE — {_closing_soon} lot(s) closing soon]" if _closing_soon else ""

            # Apply ±25% jitter to the cycle timer so requests never arrive
            # at a perfectly predictable interval (a classic bot signature).
            _sleep_actual = _jitter(_effective_timer, pct=0.25)
            print(f"[Thread B] ✅ Cycle complete. Sleeping {int(_sleep_actual)}s (timer={_effective_timer}s ±25%).{_boost_label}")
            # Poll every 5 s so that any API write (new keyword, site, config
            # change) sets _cycle_now and the engine wakes up immediately
            # instead of waiting the full inter-cycle sleep.
            _slept = 0.0
            _poll_interval = 5.0
            while _slept < _sleep_actual:
                if _cycle_now.is_set():
                    _cycle_now.clear()
                    print("[Thread B] ⚡ Change detected — skipping sleep, starting new cycle now.")
                    break
                _chunk = min(_poll_interval, _sleep_actual - _slept)
                await asyncio.sleep(_chunk)
                _slept += _chunk
|
||
|
||
|
||
async def _price_refresh_pass() -> None:
    """
    Single price-refresh pass — visits every saved lot page, pulls current
    price + time-left, writes changes to DB.

    Runs in its OWN dedicated asyncio event loop (Thread D) so it is
    completely isolated from the main scraper loop (Thread B).
    The two loops never share an event loop, never block each other, and
    never compete for the same browser instance. SQLite handles concurrent
    DB writes via its WAL journal — each function uses its own SessionLocal.
    """
    # Dedicated session for this pass; closed in the finally below (or on the
    # early-exit paths) so the connection never leaks.
    db = SessionLocal()
    try:
        # "no-link-%" placeholder links are synthetic rows with no page to visit.
        listings = db.query(Listing).filter(Listing.link.notlike("no-link-%")).all()
    except Exception as exc:
        print(f"[Refresh] ❌ DB read failed: {exc}")
        db.close()
        return

    if not listings:
        db.close()
        return

    print(f"[Refresh] 🔄 Starting price pass — {len(listings)} lot(s)…")
    updated = 0

    try:
        async with async_playwright() as pw:
            browser = await pw.chromium.launch(
                headless=True,
                # Flags keep background pages/timers running at full speed so
                # countdowns and SPA updates are not throttled while headless.
                args=[
                    "--no-sandbox",
                    "--disable-renderer-backgrounding",
                    "--disable-background-timer-throttling",
                    "--disable-backgrounding-occluded-windows",
                ],
            )
            context = await browser.new_context(
                user_agent=random.choice(_rotating_agents),
                locale="en-GB",
            )
            # Mask the headless fingerprint before any page script runs.
            await context.add_init_script(
                "Object.defineProperty(navigator,'webdriver',{get:()=>undefined});"
                "document.hasFocus=()=>true;"
            )

            # One fresh page per lot; failures on a single lot never abort the pass.
            for listing in listings:
                try:
                    page = await context.new_page()
                    # Skip heavy assets — we only need DOM text for price/time.
                    await page.route(
                        "**/*.{png,jpg,jpeg,gif,svg,woff,woff2,ttf,mp4,webp}",
                        lambda route: route.abort(),
                    )
                    await page.goto(
                        listing.link, timeout=30_000, wait_until="domcontentloaded"
                    )
                    # ── Smart wait: Apollo cache polling (HiBid & GraphQL SPAs) ───────
                    # Lot id parsed from the URL feeds JS_APOLLO_WAIT so it can poll
                    # the client-side cache for this specific lot's data.
                    _lot_id_m = re.search(r'/(?:lot|item|product)/(\d+)', listing.link, re.IGNORECASE)
                    _lot_id = _lot_id_m.group(1) if _lot_id_m else None
                    try:
                        await page.wait_for_function(
                            JS_APOLLO_WAIT, arg=_lot_id, timeout=8000, polling=200
                        )
                    except Exception:
                        # Best-effort wait — sites without Apollo simply time out here.
                        pass
                    await page.wait_for_timeout(1500)

                    # Pull price + time from the detail page
                    data = await page.evaluate(r"""() => {
                        const PRICE_SELS = [
                            '[class*="current-bid"] [class*="amount"]',
                            '[class*="current-bid"]', '[class*="bid-amount"]',
                            '.s-item__price', '[itemprop="price"]',
                            'span[class*="price"]', '.price', '[class*="price"]',
                        ];
                        const TIME_SELS = [
                            '[class*="time-left"]', '[class*="timeleft"]',
                            '[class*="countdown"]', '[class*="closing-time"]',
                            '[class*="time-remaining"]', '[class*="ends-in"]',
                            '.s-item__time-left', '[class*="expire"]',
                            '[class*="end-time"]', 'time',
                        ];
                        const q = sels => { for (const s of sels) {
                            try { const el = document.querySelector(s); if (el) return el; }
                            catch(e) {} } return null; };
                        const pe = q(PRICE_SELS), te = q(TIME_SELS);
                        let pt = pe ? (
                            pe.innerText ||
                            pe.getAttribute('data-price') ||
                            pe.getAttribute('content') || ''
                        ).trim() : '';
                        if (pt.includes('\n')) {
                            const ln = pt.split('\n').find(x => /\d/.test(x));
                            if (ln) pt = ln.trim();
                        }
                        return { price_text: pt, time_text: te ? te.innerText.trim() : '' };
                    }""")

                    # Pull images via shared 5-layer extractor (same as initial scrape)
                    img_urls = await page.evaluate(JS_DETAIL_IMAGES)

                    await page.close()

                    price_text = (data.get("price_text") or "").strip()
                    time_text = (data.get("time_text") or "").strip()
                    # img_urls already set by JS_DETAIL_IMAGES evaluate above

                    if not price_text and not time_text and not img_urls:
                        continue

                    amount, currency = _extract_price_and_currency(price_text)
                    price_display = _format_price(amount, currency)
                    time_left_str = _extract_time_left(time_text)
                    # NOTE(review): timeLeftToMins() is evaluated twice here — once for
                    # the inf-guard and once for the rounded value. Harmless if the
                    # helper is pure/cheap; confirm before hoisting.
                    tl_mins = round(timeLeftToMins(time_left_str), 4) if time_left_str and timeLeftToMins(time_left_str) != float('inf') else None

                    changed = False
                    if amount is not None and amount != listing.price:
                        listing.price = amount
                        # Slices cap column widths (presumably matching the schema —
                        # verify against models.Listing).
                        listing.currency = currency[:10] if currency else ""
                        listing.price_raw = price_display[:100]
                        changed = True
                    if time_left_str and time_left_str != listing.time_left:
                        listing.time_left = time_left_str[:60]
                        listing.time_left_mins = tl_mins
                        changed = True
                    # Update images whenever the URL set differs from what we stored —
                    # handles count changes (0→5, 1→5) AND quality upgrades where count
                    # stays the same but URLs differ (thumbnail→full-size).
                    # Guard: never overwrite a good set with an empty result.
                    if img_urls:
                        existing_imgs = []
                        try: existing_imgs = json.loads(listing.images or "[]")
                        except Exception: pass
                        if img_urls != existing_imgs:
                            listing.images = json.dumps(img_urls[:10])
                            changed = True
                    if changed:
                        # price_updated_at doubles as the dashboard's refresh signal
                        # (see /api/listings/refresh-status).
                        listing.price_updated_at = datetime.now()
                        db.commit()
                        updated += 1
                        print(
                            f"[Refresh] ✅ {listing.title[:35]} → "
                            f"{price_display} | {time_left_str}"
                        )

                except Exception as lot_exc:
                    # Per-lot failure: log and move on to the next lot.
                    print(f"[Refresh] ⚠️ {listing.link[:55]}: {lot_exc}")
                    continue

            await browser.close()

    except Exception as exc:
        print(f"[Refresh] ❌ Browser error: {exc}")
    finally:
        db.close()

    print(f"[Refresh] ✅ Pass done — {updated}/{len(listings)} updated.")
|
||
|
||
|
||
def run_scraper_thread() -> None:
    """Thread B — main scraper + Telegram C2. Never touches price refresh."""
    # Dedicated event loop for this thread; the scraper engine and the
    # Telegram C2 poller run concurrently on it, forever.
    scraper_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(scraper_loop)
    both_tasks = asyncio.gather(nuclear_engine(), telegram_c2_loop())
    scraper_loop.run_until_complete(both_tasks)
|
||
|
||
|
||
def run_refresh_thread() -> None:
    """
    Thread D — price/time-left refresh, completely isolated from Thread B.

    Owns its own asyncio event loop so it never competes with the scraper
    for the event loop, browser instances, or DB connections; SQLite WAL
    mode keeps concurrent writes from both threads safe. The loop sleeps
    5 minutes, runs one full refresh pass, and repeats — a pass that
    overruns the interval simply delays the next one, so passes can never
    overlap.
    """
    import asyncio as _aio

    async def _refresh_forever():
        INTERVAL = 300  # 5 minutes
        print("[Thread D] 💰 Price-refresh thread online.")
        while True:
            await _aio.sleep(INTERVAL)
            try:
                await _price_refresh_pass()
            except Exception as exc:
                # Swallow and log — one bad pass must not kill the thread.
                print(f"[Thread D] ❌ Unhandled error in refresh pass: {exc}")

    refresh_loop = _aio.new_event_loop()
    _aio.set_event_loop(refresh_loop)
    refresh_loop.run_until_complete(_refresh_forever())
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
# Thread A — FastAPI Dashboard
# ─────────────────────────────────────────────────────────────────────────────

app = FastAPI(title="Ghost Node", version="1.0.0")

# Wide-open CORS: all origins/methods/headers allowed so the dashboard can be
# loaded from any host/port (or file://) and still call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


# ── Static files / Dashboard ─────────────────────────────────────────────────

# Legacy single-file dashboard, expected next to this module.
DASHBOARD_PATH = os.path.join(os.path.dirname(__file__), "dashboard.html")
|
||
|
||
@app.get("/", response_class=HTMLResponse)
async def serve_dashboard():
    """
    Serve the dashboard UI.

    Prefers the Next.js static export (frontend/out/index.html) when it
    exists; otherwise falls back to the legacy dashboard.html. Returns a
    404 page when neither file is present.
    """
    base_dir = os.path.dirname(__file__)
    candidates = [
        os.path.join(base_dir, "frontend", "out", "index.html"),
        DASHBOARD_PATH,
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            with open(candidate, "r", encoding="utf-8") as fh:
                return HTMLResponse(content=fh.read())
    return HTMLResponse("<h1>Dashboard not found</h1>", status_code=404)
|
||
|
||
@app.get("/legacy", response_class=HTMLResponse)
async def serve_legacy():
    """Always serves the original dashboard.html regardless of Next.js build."""
    if not os.path.exists(DASHBOARD_PATH):
        return HTMLResponse("<h1>Legacy dashboard not found</h1>", status_code=404)
    with open(DASHBOARD_PATH, "r", encoding="utf-8") as fh:
        return HTMLResponse(content=fh.read())
|
||
|
||
|
||
# ── Stats ────────────────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/stats")
def get_stats():
    """Return the live stats dict plus computed uptime in whole seconds."""
    elapsed = int(time.time() - _stats["uptime_start"])
    payload = dict(_stats)
    payload["uptime_seconds"] = elapsed
    return payload
|
||
|
||
|
||
# ── Listings ─────────────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/listings")
def get_listings(limit: int = 100, db: Session = Depends(get_db)):
    """Return up to `limit` listings as dicts, newest first."""
    newest_first = (
        db.query(Listing)
        .order_by(Listing.timestamp.desc())
        .limit(limit)
        .all()
    )
    return [row.to_dict() for row in newest_first]
|
||
|
||
|
||
@app.delete("/api/listings/{listing_id}")
def delete_listing(listing_id: int, db: Session = Depends(get_db)):
    """Delete one listing by primary key; 404 when it does not exist."""
    target = db.query(Listing).filter(Listing.id == listing_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    return {"status": "deleted"}
|
||
|
||
|
||
@app.delete("/api/listings")
def clear_listings(db: Session = Depends(get_db)):
    """Delete every row in the listings table and commit."""
    db.query(Listing).delete()
    db.commit()
    return {"status": "cleared"}
|
||
|
||
|
||
@app.get("/api/listings/countdown-sync")
def countdown_sync(db: Session = Depends(get_db)):
    """
    Lightweight endpoint polled every 60s by the dashboard countdown ticker.

    Returns only what the live countdown needs per listing:
      - id
      - time_left_mins (float, updated by price-refresh Thread D)
      - price_updated_at (ISO string — when time_left_mins was measured)
      - timestamp (fallback reference if price_updated_at is null)

    Much cheaper than /api/listings since title/price/link/score are skipped.
    The frontend uses this to silently patch data-tlmins and data-captured on
    each .tl-cell without triggering a full table re-render.
    """
    def _iso(dt):
        return dt.isoformat() if dt else None

    rows = db.query(
        Listing.id,
        Listing.time_left_mins,
        Listing.price_updated_at,
        Listing.timestamp,
    ).all()

    payload = []
    for row in rows:
        payload.append({
            "id": row.id,
            "time_left_mins": row.time_left_mins,
            "price_updated_at": _iso(row.price_updated_at),
            "timestamp": _iso(row.timestamp),
        })
    return payload
|
||
|
||
|
||
@app.get("/api/listings/refresh-status")
def get_refresh_status(db: Session = Depends(get_db)):
    """
    Return the newest price_updated_at across all listings plus a row count.

    The dashboard polls this every 30s and re-fetches /api/listings only when
    the timestamp changes — so price updates appear automatically.
    """
    from sqlalchemy import func as sqlfunc

    newest = db.query(sqlfunc.max(Listing.price_updated_at)).scalar()
    total = db.query(Listing).count()
    return {
        "last_price_update": newest.isoformat() if newest else None,
        "listing_count": total,
    }
|
||
|
||
|
||
# ── Keywords ─────────────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/keywords")
def get_keywords(db: Session = Depends(get_db)):
    """Return all keywords ordered by sort_order, then id."""
    ordered = (
        db.query(Keyword)
        .order_by(Keyword.sort_order.asc(), Keyword.id.asc())
        .all()
    )
    return [row.to_dict() for row in ordered]
|
||
|
||
|
||
@app.post("/api/keywords")
async def add_keyword(request: Request, db: Session = Depends(get_db)):
    """
    Create a keyword from {"term": ..., "weight": ...}.
    400 when term is empty, 409 when the term already exists.
    Wakes the scraper immediately on success.
    """
    payload = await request.json()
    term = str(payload.get("term", "")).strip()
    weight = int(payload.get("weight", 1))
    if not term:
        return JSONResponse({"error": "term required"}, status_code=400)
    duplicate = db.query(Keyword).filter(Keyword.term == term).first()
    if duplicate is not None:
        return JSONResponse({"error": "duplicate"}, status_code=409)
    # sort_order = current row count so the new keyword lands at the bottom
    bottom = db.query(Keyword).count()
    new_kw = Keyword(term=term, weight=weight, sort_order=bottom)
    db.add(new_kw)
    db.commit()
    db.refresh(new_kw)
    _cycle_now.set()  # wake scraper immediately
    return new_kw.to_dict()
|
||
|
||
|
||
@app.put("/api/keywords/{kw_id}")
async def update_keyword(kw_id: int, request: Request, db: Session = Depends(get_db)):
    """Update keyword term, weight, ai_target, min_price, max_price, and/or sort_order."""
    kw = db.query(Keyword).filter(Keyword.id == kw_id).first()
    if kw is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    payload = await request.json()

    def _nullable_float(raw):
        # None, "" and "null" all mean "clear the bound"
        return None if raw in (None, "", "null") else float(raw)

    if "term" in payload:
        candidate = str(payload["term"]).strip()
        if candidate and candidate != kw.term:
            clash = (
                db.query(Keyword)
                .filter(Keyword.term == candidate, Keyword.id != kw_id)
                .first()
            )
            if clash:
                return JSONResponse({"error": "duplicate term"}, status_code=409)
            kw.term = candidate
    if "weight" in payload:
        kw.weight = max(1, int(payload["weight"] or 1))
    if "ai_target" in payload:
        kw.ai_target = str(payload["ai_target"]).strip() or None
    if "min_price" in payload:
        kw.min_price = _nullable_float(payload["min_price"])
    if "max_price" in payload:
        kw.max_price = _nullable_float(payload["max_price"])
    if "sort_order" in payload:
        kw.sort_order = int(payload["sort_order"])

    db.flush()
    db.commit()
    db.refresh(kw)
    _cycle_now.set()  # wake scraper immediately
    return kw.to_dict()
|
||
|
||
|
||
@app.post("/api/keywords/reorder")
async def reorder_keywords(request: Request, db: Session = Depends(get_db)):
    """Accepts {order: [id, id, ...]} and bulk-updates sort_order."""
    payload = await request.json()
    for position, keyword_id in enumerate(payload.get("order", [])):
        (
            db.query(Keyword)
            .filter(Keyword.id == keyword_id)
            .update({"sort_order": position})
        )
    db.flush()
    db.commit()
    return {"status": "reordered"}
|
||
|
||
|
||
@app.delete("/api/keywords/{kw_id}")
def delete_keyword(kw_id: int, db: Session = Depends(get_db)):
    """Delete one keyword by id (404 when missing) and wake the scraper."""
    target = db.query(Keyword).filter(Keyword.id == kw_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    _cycle_now.set()  # wake scraper immediately
    return {"status": "deleted"}
|
||
|
||
|
||
# ── N6: Scoring Rules ─────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/scoring-rules")
def get_scoring_rules(db: Session = Depends(get_db)):
    """Return every scoring rule as a dict, ordered by id ascending."""
    rules = db.query(ScoringRule).order_by(ScoringRule.id.asc()).all()
    return [rule.to_dict() for rule in rules]
|
||
|
||
|
||
@app.post("/api/scoring-rules")
async def create_scoring_rule(request: Request, db: Session = Depends(get_db)):
    """
    Create a scoring rule from {"signal": ..., "delta": ..., "notes": ...}.
    400 when signal/delta are missing, 409 on a (case-insensitive) duplicate
    signal. Category is derived from delta's sign.
    """
    payload = await request.json()
    signal = (payload.get("signal") or "").strip()
    delta = payload.get("delta")
    if not signal or delta is None:
        return JSONResponse({"error": "signal and delta required"}, status_code=400)
    duplicate = db.query(ScoringRule).filter(ScoringRule.signal.ilike(signal)).first()
    if duplicate:
        return JSONResponse({"error": "duplicate signal"}, status_code=409)

    delta_int = int(delta)
    new_rule = ScoringRule(
        signal=signal[:100],
        delta=delta_int,
        category="positive" if delta_int > 0 else "negative",
        notes=(payload.get("notes") or "").strip() or None,
    )
    db.add(new_rule)
    db.flush()
    db.commit()
    return new_rule.to_dict()
|
||
|
||
|
||
@app.put("/api/scoring-rules/{rule_id}")
async def update_scoring_rule(rule_id: int, request: Request, db: Session = Depends(get_db)):
    """Patch signal/delta/notes on one rule; category tracks delta's sign."""
    rule = db.query(ScoringRule).filter(ScoringRule.id == rule_id).first()
    if rule is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    payload = await request.json()
    if "signal" in payload:
        rule.signal = payload["signal"].strip()[:100]
    if "delta" in payload:
        rule.delta = int(payload["delta"])
        rule.category = "positive" if rule.delta > 0 else "negative"
    if "notes" in payload:
        rule.notes = (payload["notes"] or "").strip() or None
    db.flush()
    db.commit()
    return rule.to_dict()
|
||
|
||
|
||
@app.delete("/api/scoring-rules/{rule_id}")
def delete_scoring_rule(rule_id: int, db: Session = Depends(get_db)):
    """Delete one scoring rule by id; 404 when it does not exist."""
    target = db.query(ScoringRule).filter(ScoringRule.id == rule_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    return {"status": "deleted"}
|
||
|
||
|
||
# ── Target Sites ─────────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/sites")
def get_sites(db: Session = Depends(get_db)):
    """Return all target sites ordered by sort_order, then id."""
    ordered = (
        db.query(TargetSite)
        .order_by(TargetSite.sort_order.asc(), TargetSite.id.asc())
        .all()
    )
    return [site.to_dict() for site in ordered]
|
||
|
||
|
||
@app.post("/api/sites/reorder")
async def reorder_sites(request: Request, db: Session = Depends(get_db)):
    """Accepts {order: [id, id, ...]} and bulk-updates sort_order."""
    payload = await request.json()
    for position, target_id in enumerate(payload.get("order", [])):
        (
            db.query(TargetSite)
            .filter(TargetSite.id == target_id)
            .update({"sort_order": position})
        )
    db.flush()
    db.commit()
    return {"status": "reordered"}
|
||
|
||
|
||
@app.post("/api/sites")
async def add_site(request: Request, db: Session = Depends(get_db)):
    """
    Register a new TargetSite.

    Reads the raw JSON body (no Pydantic model) so optional fields never
    produce framework 400s. `enabled` is set to 1 explicitly rather than
    relying on the column default under concurrent load, and db.flush()
    runs before db.commit() to force the INSERT into the SQLite WAL so the
    scraper thread's next session sees the new row.

    Navigation mode is inferred from the URL template:
      Mode A — DIRECT:   url_template contains {keyword}; the scraper
                         substitutes the keyword and navigates directly.
      Mode B — HOMEPAGE: no {keyword}; the scraper navigates to the URL and
                         types into the box named by search_selector.
                         A missing selector is warned about but the site is
                         still saved (it can be added via PUT later).
    """
    try:
        payload = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)

    name = str(payload.get("name", "")).strip()
    template = str(payload.get("url_template", "")).strip()
    selector = str(payload.get("search_selector", "")).strip()

    if not (name and template):
        return JSONResponse({"error": "name and url_template are required"}, status_code=400)

    # ── Infer navigation mode and warn (not reject) for homepage mode ────────
    if "{keyword}" in template:
        mode_label = "DIRECT (keyword substitution)"
    else:
        mode_label = "HOMEPAGE (search-box interaction)"
        if not selector:
            # Warn but still save — operator can add selector via PUT later
            print(
                f"[API] ⚠️ Site '{name}' saved in HOMEPAGE mode but "
                f"search_selector is empty. Add a CSS selector "
                f"(e.g. 'input#st') via the Target Sites tab or the "
                f"scraper will skip this site until one is provided."
            )

    max_pages = max(1, int(payload.get("max_pages", 1) or 1))
    requires_login = bool(payload.get("requires_login", False))
    login_url = str(payload.get("login_url", "") or "").strip()
    login_check = str(payload.get("login_check_selector", "") or "").strip()
    login_enabled = bool(payload.get("login_enabled", requires_login))
    bottom = db.query(TargetSite).count()  # new site goes to the bottom

    site = TargetSite(
        name=name,
        url_template=template,
        search_selector=selector,
        enabled=1,  # explicit — never rely on column default for critical flag
        max_pages=max_pages,
        sort_order=bottom,
        requires_login=requires_login,
        login_url=login_url,
        login_check_selector=login_check,
        login_enabled=login_enabled,
    )
    db.add(site)
    db.flush()  # pushes INSERT to SQLite WAL before commit
    db.commit()
    db.refresh(site)

    print(f"[API] ✅ New TargetSite saved: '{site.name}' id={site.id} "
          f"mode={mode_label} pages={max_pages} login={requires_login}")

    # Auto-adapt: if enabled, kick off AI selector generation immediately for new site
    if _get_config("auto_adapt_enabled", "false").lower() == "true":
        asyncio.create_task(adapt_site_now(site.id))
        print(f"[AutoAdapt] 🆕 New site '{site.name}' — auto-adapt queued.")

    _cycle_now.set()  # wake scraper immediately
    return site.to_dict()
|
||
|
||
|
||
@app.put("/api/sites/{site_id}")
async def update_site(site_id: int, request: Request, db: Session = Depends(get_db)):
    """
    Update a TargetSite row.

    'enabled' is coerced to a plain int (1 or 0) whether the dashboard sends
    a JSON boolean or an integer — SQLite stores INTEGER and the scraper's
    filter TargetSite.enabled == 1 must compare against an int. db.flush()
    runs before db.commit() so the UPDATE reaches the SQLite WAL at once,
    closing the race where the scraper thread could open a new session and
    still read a stale 'enabled' value.
    """
    try:
        payload = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)

    site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if site is None:
        return JSONResponse({"error": "not found"}, status_code=404)

    for key in ("name", "url_template", "search_selector"):
        if key in payload:
            setattr(site, key, payload[key])

    # ── Coerce 'enabled' to plain int so SQLite stores 1 or 0, never True/False ──
    if "enabled" in payload:
        site.enabled = 1 if payload["enabled"] else 0

    # ── max_pages and login fields ────────────────────────────────────────────
    if "max_pages" in payload:
        site.max_pages = max(1, int(payload["max_pages"] or 1))
    for key in ("requires_login", "login_enabled"):
        if key in payload:
            setattr(site, key, bool(payload[key]))
    for key in ("login_url", "login_check_selector"):
        if key in payload:
            setattr(site, key, str(payload[key] or "").strip())

    db.flush()  # ← pushes UPDATE to WAL; scraper thread sees it immediately
    db.commit()
    db.refresh(site)

    status = "ENABLED ✅" if site.enabled == 1 else "DISABLED ⏸"
    print(f"[API] ✅ Site '{site.name}' (id={site_id}) → {status}")
    _cycle_now.set()  # wake scraper immediately
    return site.to_dict()
|
||
|
||
|
||
@app.delete("/api/sites/{site_id}")
def delete_site(site_id: int, db: Session = Depends(get_db)):
    """Delete one target site by id (404 when missing) and wake the scraper."""
    target = db.query(TargetSite).filter(TargetSite.id == site_id).first()
    if target is None:
        return JSONResponse({"error": "not found"}, status_code=404)
    db.delete(target)
    db.commit()
    _cycle_now.set()  # wake scraper immediately
    return {"status": "deleted"}
|
||
|
||
|
||
# ── Config / Settings ─────────────────────────────────────────────────────────
|
||
|
||
@app.get("/api/config")
def get_config(db: Session = Depends(get_db)):
    """Return the whole Config table as a flat key→value dict."""
    return {row.key: row.value for row in db.query(Config).all()}
|
||
|
||
|
||
@app.post("/api/config")
async def save_config(request: Request, db: Session = Depends(get_db)):
    """
    Upsert a JSON dict of key→value pairs into the Config table.

    Reads the raw body (no Pydantic) to avoid framework 400s. db.flush()
    forces the UPDATEs/INSERTs into the SQLite WAL before commit, so the
    scraper thread's next _get_config() call sees fresh values.
    """
    try:
        payload = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)

    saved_keys: list[str] = []
    for key, value in payload.items():
        existing = db.query(Config).filter(Config.key == key).first()
        if existing is None:
            db.add(Config(key=key, value=str(value)))
        else:
            existing.value = str(value)
        saved_keys.append(key)

    db.flush()   # push dirty rows to SQLite WAL
    db.commit()  # finalise the transaction on disk

    # Terminal confirmation — proves the write happened
    print(f"[API] ✅ Config saved to DB: {saved_keys}")
    for k in saved_keys:
        row = db.query(Config).filter(Config.key == k).first()
        if row and row.value and len(row.value) > 6:
            display = row.value[:6] + "…"  # truncate secrets/long values
        else:
            display = row.value if row else ""
        print(f"    {k} = {display!r}")

    _cycle_now.set()  # wake scraper immediately
    return {"status": "saved", "keys": saved_keys}
|
||
|
||
|
||
# ── N16: AI Test Endpoint ──────────────────────────────────────────────────────
|
||
|
||
@app.post("/api/ai/test")
async def ai_test(request: Request):
    """
    Run the AI filter once against a sample title/target pair.

    Body: {"title": "...", "ai_target": "..."}
    Returns: {"match": bool, "reason": "..."} (plus the active provider).
    """
    try:
        payload = await request.json()
    except Exception:
        return JSONResponse({"error": "invalid JSON"}, status_code=400)

    title = str(payload.get("title", "")).strip()
    ai_target = str(payload.get("ai_target", "")).strip()
    if not (title and ai_target):
        return JSONResponse({"error": "title and ai_target required"}, status_code=400)

    provider = _get_config("ai_provider", "groq").strip().lower()
    if provider == "none":
        return {"match": True, "reason": "AI provider is set to none — filter disabled."}

    match, reason = await _ai_analyze(title, ai_target)
    return {"match": match, "reason": reason, "provider": provider}
|
||
|
||
|
||
@app.get("/api/ai/debug/log")
def ai_debug_log(limit: int = 200, since_id: int = 0):
    """
    Return the in-memory AI debug log (newest entries last).

    - limit: max entries to return (default 200, max 300)
    - since_id: only return entries with id > since_id (for polling — pass the
      last id you received to get only new entries since then)

    Requires ai_debug = true in config to produce entries; always returns the
    current buffer regardless.

    Fix: the buffer is snapshotted exactly once, under the lock. The original
    re-read `_ai_debug_log` without the lock (via len(list(...))) to compute
    total_in_buffer, racing with the writer thread and reporting a count from
    a different moment than the entries themselves.
    """
    with _ai_debug_log_lock:
        snapshot = list(_ai_debug_log)

    entries = snapshot
    if since_id > 0:
        entries = [e for e in entries if e.get("id", 0) > since_id]
    entries = entries[-min(limit, 300):]

    return {
        "debug_enabled": _ai_debug_enabled(),
        # Count from the same snapshot the entries came from — consistent view.
        "total_in_buffer": len(snapshot),
        "entries": entries,
    }
|
||
|
||
|
||
@app.delete("/api/ai/debug/log")
def ai_debug_log_clear():
    """Empty the in-memory AI debug log buffer (under its lock)."""
    with _ai_debug_log_lock:
        _ai_debug_log.clear()
    return {"status": "ok", "message": "AI debug log cleared."}
|
||
|
||
|
||
# ── Engine Control ─────────────────────────────────────────────────────────────
|
||
|
||
@app.post("/api/engine/pause")
def engine_pause():
    """Set the shared engine_status to "Paused" and confirm to the caller."""
    _stats["engine_status"] = "Paused"
    return {"status": "paused"}
|
||
|
||
|
||
@app.post("/api/engine/resume")
def engine_resume():
    """Set the shared engine_status to "Running" and confirm to the caller."""
    _stats["engine_status"] = "Running"
    return {"status": "running"}
|
||
|
||
|
||
@app.post("/api/engine/restart")
def engine_restart():
    """
    Cross-platform hard restart.

    Strategy:
      1. Respond HTTP 200 immediately so the client gets a clean response.
      2. A daemon thread waits 1 second (lets uvicorn flush the response),
         then spawns a brand-new Python process running the same script with
         the same arguments via subprocess.Popen.
      3. After spawning the child, the current process calls os._exit(0) to
         terminate itself immediately and release port 8000.

    Why not os.execv?
      os.execv works on Linux but on Windows it does NOT replace the current
      process — it creates a new one while the old one keeps running, which
      causes an "address already in use" error on port 8000.

    Why subprocess.Popen + os._exit(0)?
      Popen detaches the child before the parent exits, so the child is
      never left as an orphan. os._exit(0) bypasses Python's atexit hooks
      and __del__ finalizers which can deadlock when uvicorn is still
      running threads.
    """
    # Local imports: threading/subprocess only needed on this rare code path.
    import threading, subprocess

    def _do_restart() -> None:
        time.sleep(1.0)  # give uvicorn time to flush the HTTP response
        try:
            print("[GhostNode] 🔄 Spawning new process…")
            # Inherit stdout/stderr so the new process logs to the same terminal
            subprocess.Popen(
                [sys.executable] + sys.argv,
                stdout=None,
                stderr=None,
                close_fds=True,
            )
            print("[GhostNode] ✅ New process launched — shutting down this instance.")
        except Exception as exc:
            # Spawn failed — keep this instance alive rather than dying for nothing.
            print(f"[GhostNode] ❌ Restart failed: {exc}")
            return
        # Kill this process immediately — port 8000 is now free for the child
        os._exit(0)

    threading.Thread(target=_do_restart, daemon=True, name="GhostNode-Restart").start()
    return {
        "status": "restarting",
        "message": "New process spawning — this instance will exit in ~1 second.",
    }
|
||
|
||
|
||
@app.post("/api/engine/kill")
def engine_kill():
    """
    Hard-kill Ghost Node immediately — no restart, no respawn.

    Responds HTTP 200 first; a daemon thread then calls os._exit(0) after a
    300 ms flush window, taking down every thread (scraper, Telegram C2,
    uvicorn). The dashboard goes offline and does NOT reconnect — the user
    must restart manually from the terminal.
    """
    def _terminate() -> None:
        time.sleep(0.3)  # let uvicorn flush the response
        print("[GhostNode] ☠ KILL signal received — terminating process.")
        os._exit(0)

    threading.Thread(target=_terminate, daemon=True, name="GhostNode-Kill").start()
    return {"status": "killed", "message": "Process terminating in ~300ms."}
|
||
|
||
|
||
|
||
|
||
|
||
# ── Telegram connectivity test ────────────────────────────────────────────────
|
||
|
||
@app.post("/api/telegram/test")
async def test_telegram():
    """
    Send a test message using whatever token/chat_id is currently in the DB.
    Returns the full Telegram response body so 401/404 etc. can be diagnosed.
    """
    token = _get_config("telegram_token")
    chat_id = _get_config("telegram_chat_id")

    if not (token and chat_id):
        return JSONResponse(
            {"ok": False, "error": "No token or chat_id saved in DB. Open Settings tab and save first."},
            status_code=400,
        )

    endpoint = f"https://api.telegram.org/bot{token}/sendMessage"
    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(
                endpoint,
                data={"chat_id": chat_id, "text": "👻 Ghost Node — Telegram test OK!", "parse_mode": "HTML"},
            )
            reply = resp.json()
            if resp.status_code == 200:
                return {"ok": True, "telegram_response": reply}
            return JSONResponse(
                {"ok": False, "http_status": resp.status_code, "telegram_response": reply},
                status_code=200,  # return 200 to JS — the Telegram error is in the body
            )
    except Exception as exc:
        return JSONResponse({"ok": False, "error": str(exc)}, status_code=500)
|
||
|
||
|
||
# ── DB read-back diagnostic ───────────────────────────────────────────────────
|
||
|
||
# ── N14 — Login trigger endpoint ─────────────────────────────────────────────
|
||
|
||
@app.post("/api/sites/{site_id}/login")
|
||
async def trigger_login(site_id: int, db: Session = Depends(get_db)):
|
||
"""
|
||
Opens a VISIBLE browser window on the site's login_url so the user can
|
||
manually log in. The session is saved to the persistent profile for that
|
||
site and reused by the scraper on all future cycles.
|
||
|
||
Only works when login_enabled = true for this site.
|
||
Returns immediately — the browser window stays open for the user to log in.
|
||
"""
|
||
site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
|
||
if not site:
|
||
return JSONResponse({"error": "site not found"}, status_code=404)
|
||
if not site.login_enabled:
|
||
return JSONResponse({"error": "login_enabled is false for this site"}, status_code=400)
|
||
if not site.login_url:
|
||
return JSONResponse({"error": "No login_url configured for this site"}, status_code=400)
|
||
|
||
import re as _re2
|
||
site_slug = _re2.sub(r"[^a-z0-9]", "_", site.name.lower())[:40]
|
||
profile_dir = os.path.join(os.path.dirname(__file__), ".browser_profiles", site_slug)
|
||
os.makedirs(profile_dir, exist_ok=True)
|
||
|
||
async def _open_login_browser():
|
||
from playwright.async_api import async_playwright
|
||
async with async_playwright() as pw:
|
||
try:
|
||
_lbl, _exe = _resolve_browser()
|
||
ctx = await pw.chromium.launch_persistent_context(
|
||
profile_dir,
|
||
executable_path=_exe or None,
|
||
headless=False, # MUST be visible so user can log in
|
||
args=["--no-sandbox"],
|
||
)
|
||
page = await ctx.new_page()
|
||
await page.goto(site.login_url, timeout=60_000, wait_until="domcontentloaded")
|
||
print(f"[Login] 🔑 Browser open for {site.name} — log in and close when done.")
|
||
# Wait up to 10 minutes for the user to log in and close
|
||
await ctx.wait_for_event("close", timeout=600_000)
|
||
print(f"[Login] ✅ Session saved for {site.name}.")
|
||
except Exception as exc:
|
||
print(f"[Login] ❌ {exc}")
|
||
|
||
# Run in background — don't block the API response
|
||
asyncio.create_task(_open_login_browser())
|
||
return {
|
||
"status": "browser_opening",
|
||
"message": f"A visible browser window is opening for {site.name}. Log in and close it when done — the session will be saved automatically.",
|
||
"login_url": site.login_url,
|
||
"profile_dir": profile_dir,
|
||
}
|
||
|
||
|
||
# ── N17 — Auto-Adapter endpoints ─────────────────────────────────────────────
|
||
|
||
@app.post("/api/sites/{site_id}/adapt")
|
||
async def trigger_adapt(site_id: int, db: Session = Depends(get_db)):
|
||
"""
|
||
Trigger AI selector generation for a site.
|
||
Launches a temporary browser, scrapes the site, sends cleaned HTML to
|
||
Groq (online) or Ollama (local) for CSS selector generation, validates
|
||
live, and persists to the site_selectors table.
|
||
Returns immediately with a task status; full result logged to console.
|
||
"""
|
||
site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
|
||
if not site:
|
||
return JSONResponse({"error": "site not found"}, status_code=404)
|
||
|
||
cfg = {r.key: r.value for r in db.query(Config).all()}
|
||
provider = cfg.get("ai_provider", "groq")
|
||
if provider == "none":
|
||
return JSONResponse({"error": "AI provider is set to 'none'. Enable Groq or Ollama in Settings."}, status_code=400)
|
||
|
||
# NOTE: auto_adapt_enabled only gates *automatic* adaptation on new site creation.
|
||
# Manual adaptation via the 🤖 ADAPT button is always permitted if an AI provider
|
||
# is configured — do NOT gate it on auto_adapt_enabled.
|
||
|
||
async def _run():
|
||
result = await adapt_site_now(site_id)
|
||
confidence = result.get("confidence", 0)
|
||
status = "✅" if confidence >= 50 else "⚠️"
|
||
print(f"[AutoAdapt] {status} Manual adapt for {site.name} done — confidence {confidence:.1f}")
|
||
|
||
asyncio.create_task(_run())
|
||
return {
|
||
"status": "adapting",
|
||
"message": f"AI selector generation started for '{site.name}'. Check console for progress. Reload the Sites tab in ~30s to see the result.",
|
||
"site_id": site_id,
|
||
"provider": provider,
|
||
}
|
||
|
||
|
||
@app.get("/api/sites/{site_id}/selectors")
|
||
def get_site_selectors(site_id: int, db: Session = Depends(get_db)):
|
||
"""Return the stored AI selectors for a site, if any."""
|
||
site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
|
||
if not site:
|
||
return JSONResponse({"error": "site not found"}, status_code=404)
|
||
ss = db.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
|
||
if not ss:
|
||
return {"site_id": site_id, "site_name": site.name, "selectors": None}
|
||
return {"site_id": site_id, "site_name": site.name, "selectors": ss.to_dict()}
|
||
|
||
|
||
@app.delete("/api/sites/{site_id}/selectors")
|
||
def delete_site_selectors(site_id: int, db: Session = Depends(get_db)):
|
||
"""Delete stored AI selectors for a site (forces re-adaptation on next cycle)."""
|
||
site = db.query(TargetSite).filter(TargetSite.id == site_id).first()
|
||
if not site:
|
||
return JSONResponse({"error": "site not found"}, status_code=404)
|
||
ss = db.query(SiteSelectors).filter(SiteSelectors.site_id == site_id).first()
|
||
if not ss:
|
||
return {"status": "ok", "message": "No selectors stored for this site."}
|
||
db.flush()
|
||
db.delete(ss)
|
||
db.commit()
|
||
return {"status": "ok", "message": f"Selectors for '{site.name}' deleted. Site will use universal extractor until re-adapted."}
|
||
|
||
|
||
# ── N15 — Export endpoints ────────────────────────────────────────────────────
|
||
|
||
import csv, json as _json
|
||
from io import StringIO
|
||
from fastapi.responses import StreamingResponse
|
||
|
||
@app.get("/api/export/csv")
|
||
def export_csv(limit: int = 10000, db: Session = Depends(get_db)):
|
||
"""Export all listings to a CSV file download."""
|
||
rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
|
||
output = StringIO()
|
||
writer = csv.writer(output)
|
||
writer.writerow(["ID","Title","Price","Currency","Price Raw","Time Left","Score",
|
||
"Keyword","Site","Link","Captured At","Price Updated At"])
|
||
for r in rows:
|
||
writer.writerow([
|
||
r.id, r.title, r.price or "", r.currency or "", r.price_raw or "",
|
||
r.time_left or "", r.score, r.keyword or "", r.site_name or "",
|
||
r.link, r.timestamp.isoformat() if r.timestamp else "",
|
||
r.price_updated_at.isoformat() if r.price_updated_at else "",
|
||
])
|
||
output.seek(0)
|
||
return StreamingResponse(
|
||
iter([output.getvalue()]),
|
||
media_type="text/csv",
|
||
headers={"Content-Disposition": "attachment; filename=ghost_node_listings.csv"},
|
||
)
|
||
|
||
|
||
@app.get("/api/export/json")
|
||
def export_json(limit: int = 10000, db: Session = Depends(get_db)):
|
||
"""Export all listings to a JSON file download."""
|
||
rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
|
||
data = _json.dumps([r.to_dict() for r in rows], indent=2, default=str)
|
||
return StreamingResponse(
|
||
iter([data]),
|
||
media_type="application/json",
|
||
headers={"Content-Disposition": "attachment; filename=ghost_node_listings.json"},
|
||
)
|
||
|
||
|
||
@app.get("/api/export/html")
|
||
def export_html(limit: int = 10000, db: Session = Depends(get_db)):
|
||
"""Export all listings as a self-contained HTML report."""
|
||
rows = db.query(Listing).order_by(Listing.timestamp.desc()).limit(limit).all()
|
||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||
rows_html = ""
|
||
for r in rows:
|
||
score_color = "#00ff88" if r.score > 0 else "#888"
|
||
ts_str = r.timestamp.strftime("%Y-%m-%d %H:%M") if r.timestamp else ""
|
||
rows_html += (
|
||
"<tr>"
|
||
f'<td><a href="{r.link}" target="_blank">{r.title[:80]}</a></td>'
|
||
f'<td>{r.price_raw or ""}</td>'
|
||
f'<td>{r.time_left or ""}</td>'
|
||
f'<td style="color:{score_color}">{r.score}</td>'
|
||
f'<td>{r.keyword or ""}</td>'
|
||
f'<td>{r.site_name or ""}</td>'
|
||
f'<td style="font-size:11px;color:#888">{ts_str}</td>'
|
||
"</tr>\n"
|
||
)
|
||
html = f"""<!DOCTYPE html>
|
||
<html lang="en">
|
||
<head>
|
||
<meta charset="UTF-8">
|
||
<title>Ghost Node Export — {now}</title>
|
||
<style>
|
||
body{{font-family:monospace;background:#0d0d1a;color:#c8d3f0;padding:24px}}
|
||
h1{{color:#00f5ff;letter-spacing:3px}}
|
||
table{{width:100%;border-collapse:collapse;margin-top:16px}}
|
||
th{{background:#1a1a2e;color:#00f5ff;padding:8px 12px;text-align:left;border-bottom:1px solid #333}}
|
||
td{{padding:6px 12px;border-bottom:1px solid #1a1a2e;font-size:13px}}
|
||
tr:hover{{background:#1a1a2e}}
|
||
a{{color:#00f5ff;text-decoration:none}}
|
||
a:hover{{text-decoration:underline}}
|
||
.meta{{color:#888;font-size:12px;margin-bottom:12px}}
|
||
</style>
|
||
</head>
|
||
<body>
|
||
<h1>// GHOST NODE LISTINGS EXPORT</h1>
|
||
<div class="meta">Generated: {now} | {len(rows)} listings</div>
|
||
<table>
|
||
<thead><tr><th>Title</th><th>Price</th><th>Time Left</th><th>Score</th><th>Keyword</th><th>Site</th><th>Captured</th></tr></thead>
|
||
<tbody>
|
||
{rows_html}
|
||
</tbody>
|
||
</table>
|
||
</body>
|
||
</html>"""
|
||
return StreamingResponse(
|
||
iter([html]),
|
||
media_type="text/html",
|
||
headers={"Content-Disposition": f"attachment; filename=ghost_node_export_{now[:10]}.html"},
|
||
)
|
||
|
||
|
||
# ── Database Backup & Restore ────────────────────────────────────────────────
|
||
|
||
@app.get("/api/backup/download")
|
||
def backup_download():
|
||
"""
|
||
Stream the raw sniper.db SQLite file as a download.
|
||
Only works when using SQLite (not PostgreSQL).
|
||
Creates a timestamped filename so backups don't overwrite each other.
|
||
"""
|
||
from database import DATABASE_URL, _is_sqlite
|
||
if not _is_sqlite:
|
||
return JSONResponse(
|
||
{"error": "Backup only supported for SQLite. Use pg_dump for PostgreSQL."},
|
||
status_code=400,
|
||
)
|
||
# Resolve the actual file path from the SQLite URL
|
||
db_path = DATABASE_URL.replace("sqlite:///", "").replace("sqlite://", "")
|
||
if not db_path.startswith("/"):
|
||
db_path = os.path.join(os.path.dirname(__file__), db_path.lstrip("./"))
|
||
db_path = os.path.abspath(db_path)
|
||
|
||
if not os.path.exists(db_path):
|
||
return JSONResponse({"error": f"Database file not found: {db_path}"}, status_code=404)
|
||
|
||
# Use a safe hot-backup: VACUUM INTO a temp file, then stream it
|
||
# This avoids streaming a live WAL-mode DB mid-write
|
||
import tempfile, shutil
|
||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||
backup_filename = f"ghost_node_backup_{ts}.db"
|
||
tmp_path = os.path.join(tempfile.gettempdir(), backup_filename)
|
||
try:
|
||
import sqlite3 as _sqlite3
|
||
conn = _sqlite3.connect(db_path)
|
||
bk = _sqlite3.connect(tmp_path)
|
||
conn.backup(bk)
|
||
bk.close()
|
||
conn.close()
|
||
except Exception as exc:
|
||
return JSONResponse({"error": f"Backup failed: {exc}"}, status_code=500)
|
||
|
||
def _stream_and_delete():
|
||
try:
|
||
with open(tmp_path, "rb") as f:
|
||
while chunk := f.read(65536):
|
||
yield chunk
|
||
finally:
|
||
try:
|
||
os.remove(tmp_path)
|
||
except Exception:
|
||
pass
|
||
|
||
print(f"[Backup] 📦 Streaming backup: {backup_filename} ({os.path.getsize(tmp_path):,} bytes)")
|
||
return StreamingResponse(
|
||
_stream_and_delete(),
|
||
media_type="application/octet-stream",
|
||
headers={"Content-Disposition": f"attachment; filename={backup_filename}"},
|
||
)
|
||
|
||
|
||
@app.post("/api/backup/restore")
|
||
async def backup_restore(request: Request):
|
||
"""
|
||
Accept a .db file upload and replace the current sniper.db with it.
|
||
The server restarts automatically after restore so all connections reopen.
|
||
SAFETY: saves the current DB as an auto-backup before overwriting.
|
||
Only works when using SQLite.
|
||
"""
|
||
from database import DATABASE_URL, _is_sqlite
|
||
if not _is_sqlite:
|
||
return JSONResponse(
|
||
{"error": "Restore only supported for SQLite."},
|
||
status_code=400,
|
||
)
|
||
|
||
db_path = DATABASE_URL.replace("sqlite:///", "").replace("sqlite://", "")
|
||
if not db_path.startswith("/"):
|
||
db_path = os.path.join(os.path.dirname(__file__), db_path.lstrip("./"))
|
||
db_path = os.path.abspath(db_path)
|
||
|
||
try:
|
||
body = await request.body()
|
||
if len(body) < 100:
|
||
return JSONResponse({"error": "Uploaded file appears empty or too small."}, status_code=400)
|
||
|
||
# Verify it's a valid SQLite file (magic bytes: "SQLite format 3")
|
||
if not body[:16].startswith(b"SQLite format 3"):
|
||
return JSONResponse(
|
||
{"error": "File does not appear to be a valid SQLite database."},
|
||
status_code=400,
|
||
)
|
||
|
||
# Auto-backup current DB before overwriting
|
||
if os.path.exists(db_path):
|
||
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||
auto_bk = db_path + f".pre_restore_{ts}.bak"
|
||
import shutil
|
||
shutil.copy2(db_path, auto_bk)
|
||
print(f"[Restore] 💾 Auto-backup saved: {auto_bk}")
|
||
|
||
# Write the uploaded file
|
||
with open(db_path, "wb") as f:
|
||
f.write(body)
|
||
|
||
print(f"[Restore] ✅ Database restored from upload ({len(body):,} bytes). Restarting...")
|
||
|
||
# Restart the process to reopen all DB connections
|
||
import threading
|
||
def _restart():
|
||
import time, sys
|
||
time.sleep(1)
|
||
os.execv(sys.executable, [sys.executable] + sys.argv)
|
||
threading.Thread(target=_restart, daemon=True).start()
|
||
|
||
return JSONResponse({
|
||
"status": "restored",
|
||
"message": "Database restored successfully. Ghost Node is restarting — refresh the dashboard in 5 seconds.",
|
||
"bytes_written": len(body),
|
||
})
|
||
|
||
except Exception as exc:
|
||
return JSONResponse({"error": f"Restore failed: {exc}"}, status_code=500)
|
||
|
||
|
||
@app.get("/api/debug/db")
|
||
def debug_db(db: Session = Depends(get_db)):
|
||
"""
|
||
Returns the exact contents of Config and TargetSite tables.
|
||
Use this to confirm that Settings-tab saves and new sites
|
||
are genuinely written to sniper.db.
|
||
"""
|
||
configs = {r.key: r.value for r in db.query(Config).all()}
|
||
# Mask token for security — show only first 8 chars
|
||
if "telegram_token" in configs and configs["telegram_token"]:
|
||
t = configs["telegram_token"]
|
||
configs["telegram_token"] = t[:8] + "…" if len(t) > 8 else t
|
||
sites = [s.to_dict() for s in db.query(TargetSite).all()]
|
||
keywords = [k.to_dict() for k in db.query(Keyword).all()]
|
||
return {
|
||
"config": configs,
|
||
"sites": sites,
|
||
"keywords": keywords,
|
||
"listing_count": db.query(Listing).count(),
|
||
}
|
||
|
||
|
||
# ── Phase 7: Serve Next.js static build ─────────────────────────────────────
|
||
import pathlib as _pathlib
_frontend_out = _pathlib.Path(__file__).parent / "frontend" / "out"
if _frontend_out.exists():
    from fastapi.staticfiles import StaticFiles
    from fastapi.responses import FileResponse as _FileResponse

    # Mount ONLY the _next directory (JS/CSS/image assets).
    # We deliberately avoid app.mount("/", html=True) because it intercepts
    # ALL paths as a Starlette sub-app, shadowing explicit routes like /legacy.
    _next_dir = _frontend_out / "_next"
    if _next_dir.exists():
        app.mount("/_next", StaticFiles(directory=str(_next_dir)), name="nextjs_assets")

    _frontend_root = _frontend_out.resolve()

    def _safe_frontend_file(rel: str):
        """Resolve `rel` inside frontend/out and return the Path only when it
        is an existing file that stays within the export directory.
        Blocks ../ path traversal out of frontend/out (e.g. requesting
        "../../sniper.db" via the catch-all route)."""
        try:
            candidate = (_frontend_root / rel).resolve()
        except (OSError, ValueError):
            return None
        try:
            candidate.relative_to(_frontend_root)  # raises if outside root
        except ValueError:
            return None
        return candidate if candidate.is_file() else None

    # SPA catch-all — registered last so all specific @app.get() routes win.
    # Handles: exact files (favicon.ico, etc.), Next.js .html pages, and the
    # SPA index.html fallback for deep-linked client-side routes.
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        # 1. Exact file match (favicon.ico, *.svg, etc.)
        exact = _safe_frontend_file(full_path)
        if exact is not None:
            return _FileResponse(str(exact))
        # 2. Next.js exported page (e.g. "dashboard" → dashboard.html)
        page = _safe_frontend_file(f"{full_path}.html")
        if page is not None:
            return _FileResponse(str(page))
        # 3. SPA fallback — let the client-side router handle it
        return _FileResponse(str(_frontend_out / "index.html"))

    print("[GhostNode] Serving Next.js frontend from frontend/out/")
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Entry Point — spin up threads
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
if __name__ == "__main__":
|
||
# Thread B (scraper & Telegram C2 share one asyncio event loop)
|
||
scraper_thread = threading.Thread(
|
||
target=run_scraper_thread,
|
||
name="GhostNode-Scraper",
|
||
daemon=True,
|
||
)
|
||
scraper_thread.start()
|
||
|
||
# Thread D (price/time-left refresh — isolated event loop, never blocks scraper)
|
||
refresh_thread = threading.Thread(
|
||
target=run_refresh_thread,
|
||
name="GhostNode-Refresh",
|
||
daemon=True,
|
||
)
|
||
refresh_thread.start()
|
||
|
||
# Thread E (closing-soon alert — isolated event loop, polls every 60s)
|
||
closing_thread = threading.Thread(
|
||
target=run_closing_alert_thread,
|
||
name="GhostNode-ClosingAlert",
|
||
daemon=True,
|
||
)
|
||
closing_thread.start()
|
||
|
||
print("[GhostNode] 🕵️ Ghost Node online — Dashboard → http://localhost:8000")
|
||
|
||
# Thread A (FastAPI via uvicorn — blocks main thread)
|
||
uvicorn.run(app, host="0.0.0.0", port=8000, log_level="warning")
|