diff --git a/assets/pasted-20260419-201602-db15dcc6.png b/assets/pasted-20260419-201602-db15dcc6.png new file mode 100644 index 0000000..3d5dd90 Binary files /dev/null and b/assets/pasted-20260419-201602-db15dcc6.png differ diff --git a/index-en.php b/index-en.php index 30927b6..0e04bd3 100644 --- a/index-en.php +++ b/index-en.php @@ -84,34 +84,98 @@ function index_vanilla_uex_normalize_search_text(string $value): string return trim((string) preg_replace('/\s+/u', ' ', $value)); } -function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool +function index_vanilla_uex_normalize_compact_search_text(string $value): string { - $normalizedTitle = index_vanilla_uex_normalize_search_text($title); - $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName); + $value = function_exists('mb_strtolower') + ? mb_strtolower($value, 'UTF-8') + : strtolower($value); - if ($normalizedTitle === '' || $normalizedQuery === '') { - return false; + return trim((string) preg_replace('/[^[:alnum:]]+/u', '', $value)); +} + +function index_vanilla_uex_tokenize_search_text(string $value): array +{ + $normalizedValue = index_vanilla_uex_normalize_search_text($value); + if ($normalizedValue === '') { + return []; } - if (strpos($normalizedTitle, $normalizedQuery) !== false) { - return true; - } - - $queryTokens = array_values(array_filter(explode(' ', $normalizedQuery), static function (string $token): bool { - return preg_match('/\d/', $token) || strlen($token) >= 3; + $tokens = array_values(array_filter(explode(' ', $normalizedValue), static function (string $token): bool { + return preg_match('/\d/u', $token) || strlen($token) >= 2; })); + return array_values(array_unique($tokens)); +} + +function index_vanilla_uex_extract_wts_subject(string $title): string +{ + $normalizedTitle = index_vanilla_uex_normalize_search_text($title); + if ($normalizedTitle === '') { + return ''; + } + + if (!preg_match('/\bwts\b/u', $normalizedTitle, $matches, PREG_OFFSET_CAPTURE)) { + return ''; + } + + $matchText = (string) ($matches[0][0] ?? 'wts'); + $matchOffset = (int) ($matches[0][1] ?? 0); + $subject = trim(substr($normalizedTitle, $matchOffset + strlen($matchText))); + + while ($subject !== '') { + $updated = trim((string) preg_replace('/^(?:wts|wtb|lts|ltb|sell|selling|trade|trading|for|offer|offering|looking|lf|want|wanted)\b[\s\-:]*/u', '', $subject)); + if ($updated === $subject) { + break; + } + + $subject = $updated; + } + + $subject = trim((string) preg_replace('/^(?:[a-z]{1,4}\s+)?\d+(?:\s+\d+)*(?:\s*[\-:])\s*/u', '', $subject)); + return $subject; +} + +function index_vanilla_uex_title_has_wts_marker(string $title): bool +{ + return index_vanilla_uex_extract_wts_subject($title) !== ''; +} + +function index_vanilla_uex_title_match_score(string $title, string $queryName): int +{ + $normalizedTitle = index_vanilla_uex_extract_wts_subject($title); + $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName); + $compactTitle = index_vanilla_uex_normalize_compact_search_text($normalizedTitle); + $compactQuery = index_vanilla_uex_normalize_compact_search_text($normalizedQuery); + + if ($normalizedTitle === '' || $normalizedQuery === '' || $compactTitle === '' || $compactQuery === '') { + return 0; + } + + if ($normalizedTitle === $normalizedQuery || $compactTitle === $compactQuery) { + return 3; + } + + if (strpos($normalizedTitle, $normalizedQuery) !== false || strpos($compactTitle, $compactQuery) !== false) { + return 2; + } + + $queryTokens = index_vanilla_uex_tokenize_search_text($normalizedQuery); if ($queryTokens === []) { - return false; + return 0; } foreach ($queryTokens as $token) { if (strpos($normalizedTitle, $token) === false) { - return false; + return 0; } } - return true; + return 1; +} + +function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool +{ + return index_vanilla_uex_title_match_score($title, $queryName) > 0; } function index_vanilla_uex_extract_price_value(string $rawPrice): ?int @@ -168,20 +232,21 @@ function index_vanilla_uex_extract_price_value(string $rawPrice): ?int function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryName, int $sampleLimit = 10): array { - $values = []; + $candidates = []; $chunks = preg_split('/]*>/i', $html) ?: []; foreach ($chunks as $chunk) { - if (count($values) >= $sampleLimit) { - break; - } - if (!preg_match('/]*class="text-bold"[^>]*>(.*?)<\/a>/is', $chunk, $titleMatches)) { continue; } $title = index_vanilla_uex_normalize_whitespace((string) ($titleMatches[1] ?? '')); - if ($title === '' || !preg_match('/^WTS\b/i', $title) || !index_vanilla_uex_title_matches_query($title, $queryName)) { + if ($title === '' || !index_vanilla_uex_title_has_wts_marker($title)) { + continue; + } + + $matchScore = index_vanilla_uex_title_match_score($title, $queryName); + if ($matchScore <= 0) { continue; } @@ -194,7 +259,34 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN continue; } - $values[] = $priceValue; + $candidates[] = [ + 'price' => $priceValue, + 'score' => $matchScore, + ]; + } + + if ($candidates === []) { + return [ + 'has_estimate' => false, + 'average' => null, + 'formatted' => '—', + 'sample_count' => 0, + ]; + } + + $bestScore = max(array_column($candidates, 'score')); + $minimumAcceptedScore = $bestScore >= 3 ? 2 : $bestScore; + $values = []; + + foreach ($candidates as $candidate) { + if (($candidate['score'] ?? 0) < $minimumAcceptedScore) { + continue; + } + + $values[] = (int) $candidate['price']; + if (count($values) >= $sampleLimit) { + break; + } } if ($values === []) { @@ -216,6 +308,44 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN ]; } +function index_vanilla_uex_create_handle(string $url, string $userAgent, int $connectTimeout, int $timeout) +{ + $handle = curl_init(); + curl_setopt_array($handle, [ + CURLOPT_URL => $url, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_FOLLOWLOCATION => true, + CURLOPT_CONNECTTIMEOUT => $connectTimeout, + CURLOPT_TIMEOUT => $timeout, + CURLOPT_USERAGENT => $userAgent, + CURLOPT_ENCODING => '', + CURLOPT_SSL_VERIFYPEER => true, + CURLOPT_SSL_VERIFYHOST => 2, + CURLOPT_HTTPHEADER => [ + 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8', + 'Cache-Control: no-cache', + ], + ]); + + return $handle; +} + +function index_vanilla_uex_fetch_single_body(string $url, string $userAgent, int $connectTimeout = 8, int $timeout = 18): ?string +{ + $handle = index_vanilla_uex_create_handle($url, $userAgent, $connectTimeout, $timeout); + $body = curl_exec($handle); + $error = curl_error($handle); + $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); + curl_close($handle); + + if (!is_string($body) || $error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { + return null; + } + + return $body; +} + function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): array { $results = []; @@ -234,65 +364,63 @@ function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): return $results; } - $multiHandle = curl_multi_init(); - $handles = []; $userAgent = 'Mozilla/5.0 (compatible; FlatLogicVanillaDb/1.0; +https://uexcorp.space/)'; + $batchSize = 4; - foreach ($uniqueNames as $name) { - $url = 'https://uexcorp.space/search?q=' . rawurlencode($name); - $handle = curl_init(); - curl_setopt_array($handle, [ - CURLOPT_URL => $url, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_CONNECTTIMEOUT => 4, - CURLOPT_TIMEOUT => 8, - CURLOPT_USERAGENT => $userAgent, - CURLOPT_SSL_VERIFYPEER => true, - CURLOPT_SSL_VERIFYHOST => 2, - CURLOPT_HTTPHEADER => [ - 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8', - 'Cache-Control: no-cache', - ], - ]); + foreach (array_chunk(array_values($uniqueNames), $batchSize) as $nameBatch) { + $multiHandle = curl_multi_init(); + $handles = []; - curl_multi_add_handle($multiHandle, $handle); - $handles[$name] = $handle; - } - - $running = null; - do { - $status = curl_multi_exec($multiHandle, $running); - if ($running) { - curl_multi_select($multiHandle, 1.0); - } - } while ($running && $status === CURLM_OK); - - foreach ($handles as $name => $handle) { - $error = curl_error($handle); - $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); - $body = (string) curl_multi_getcontent($handle); - - if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { - $results[$name] = [ - 'has_estimate' => false, - 'average' => null, - 'formatted' => 'Indisponible', - 'sample_count' => 0, - 'error' => true, + foreach ($nameBatch as $name) { + $url = 'https://uexcorp.space/search?q=' . rawurlencode($name); + $handle = index_vanilla_uex_create_handle($url, $userAgent, 6, 12); + curl_multi_add_handle($multiHandle, $handle); + $handles[$name] = [ + 'handle' => $handle, + 'url' => $url, ]; - } else { + } + + $running = null; + do { + $status = curl_multi_exec($multiHandle, $running); + if ($running) { + curl_multi_select($multiHandle, 1.0); + } + } while ($running && $status === CURLM_OK); + + foreach ($handles as $name => $payload) { + $handle = $payload['handle']; + $url = $payload['url']; + $error = curl_error($handle); + $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); + $body = (string) curl_multi_getcontent($handle); + + curl_multi_remove_handle($multiHandle, $handle); + curl_close($handle); + + if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { + $body = index_vanilla_uex_fetch_single_body($url, $userAgent) ?? ''; + } + + if (trim($body) === '') { + $results[$name] = [ + 'has_estimate' => false, + 'average' => null, + 'formatted' => 'Indisponible', + 'sample_count' => 0, + 'error' => true, + ]; + continue; + } + $results[$name] = index_vanilla_uex_parse_estimate_from_html($body, $name, $sampleLimit); $results[$name]['error'] = false; } - curl_multi_remove_handle($multiHandle, $handle); - curl_close($handle); + curl_multi_close($multiHandle); } - curl_multi_close($multiHandle); - return $results; } diff --git a/index.php b/index.php index 67e8533..90c0b78 100644 --- a/index.php +++ b/index.php @@ -85,34 +85,98 @@ function index_vanilla_uex_normalize_search_text(string $value): string return trim((string) preg_replace('/\s+/u', ' ', $value)); } -function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool +function index_vanilla_uex_normalize_compact_search_text(string $value): string { - $normalizedTitle = index_vanilla_uex_normalize_search_text($title); - $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName); + $value = function_exists('mb_strtolower') + ? mb_strtolower($value, 'UTF-8') + : strtolower($value); - if ($normalizedTitle === '' || $normalizedQuery === '') { - return false; + return trim((string) preg_replace('/[^[:alnum:]]+/u', '', $value)); +} + +function index_vanilla_uex_tokenize_search_text(string $value): array +{ + $normalizedValue = index_vanilla_uex_normalize_search_text($value); + if ($normalizedValue === '') { + return []; } - if (strpos($normalizedTitle, $normalizedQuery) !== false) { - return true; - } - - $queryTokens = array_values(array_filter(explode(' ', $normalizedQuery), static function (string $token): bool { - return preg_match('/\d/', $token) || strlen($token) >= 3; + $tokens = array_values(array_filter(explode(' ', $normalizedValue), static function (string $token): bool { + return preg_match('/\d/u', $token) || strlen($token) >= 2; })); + return array_values(array_unique($tokens)); +} + +function index_vanilla_uex_extract_wts_subject(string $title): string +{ + $normalizedTitle = index_vanilla_uex_normalize_search_text($title); + if ($normalizedTitle === '') { + return ''; + } + + if (!preg_match('/\bwts\b/u', $normalizedTitle, $matches, PREG_OFFSET_CAPTURE)) { + return ''; + } + + $matchText = (string) ($matches[0][0] ?? 'wts'); + $matchOffset = (int) ($matches[0][1] ?? 0); + $subject = trim(substr($normalizedTitle, $matchOffset + strlen($matchText))); + + while ($subject !== '') { + $updated = trim((string) preg_replace('/^(?:wts|wtb|lts|ltb|sell|selling|trade|trading|for|offer|offering|looking|lf|want|wanted)\b[\s\-:]*/u', '', $subject)); + if ($updated === $subject) { + break; + } + + $subject = $updated; + } + + $subject = trim((string) preg_replace('/^(?:[a-z]{1,4}\s+)?\d+(?:\s+\d+)*(?:\s*[\-:])\s*/u', '', $subject)); + return $subject; +} + +function index_vanilla_uex_title_has_wts_marker(string $title): bool +{ + return index_vanilla_uex_extract_wts_subject($title) !== ''; +} + +function index_vanilla_uex_title_match_score(string $title, string $queryName): int +{ + $normalizedTitle = index_vanilla_uex_extract_wts_subject($title); + $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName); + $compactTitle = index_vanilla_uex_normalize_compact_search_text($normalizedTitle); + $compactQuery = index_vanilla_uex_normalize_compact_search_text($normalizedQuery); + + if ($normalizedTitle === '' || $normalizedQuery === '' || $compactTitle === '' || $compactQuery === '') { + return 0; + } + + if ($normalizedTitle === $normalizedQuery || $compactTitle === $compactQuery) { + return 3; + } + + if (strpos($normalizedTitle, $normalizedQuery) !== false || strpos($compactTitle, $compactQuery) !== false) { + return 2; + } + + $queryTokens = index_vanilla_uex_tokenize_search_text($normalizedQuery); if ($queryTokens === []) { - return false; + return 0; } foreach ($queryTokens as $token) { if (strpos($normalizedTitle, $token) === false) { - return false; + return 0; } } - return true; + return 1; +} + +function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool +{ + return index_vanilla_uex_title_match_score($title, $queryName) > 0; } function index_vanilla_uex_extract_price_value(string $rawPrice): ?int @@ -169,20 +233,21 @@ function index_vanilla_uex_extract_price_value(string $rawPrice): ?int function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryName, int $sampleLimit = 10): array { - $values = []; + $candidates = []; $chunks = preg_split('/]*>/i', $html) ?: []; foreach ($chunks as $chunk) { - if (count($values) >= $sampleLimit) { - break; - } - if (!preg_match('/]*class="text-bold"[^>]*>(.*?)<\/a>/is', $chunk, $titleMatches)) { continue; } $title = index_vanilla_uex_normalize_whitespace((string) ($titleMatches[1] ?? '')); - if ($title === '' || !preg_match('/^WTS\b/i', $title) || !index_vanilla_uex_title_matches_query($title, $queryName)) { + if ($title === '' || !index_vanilla_uex_title_has_wts_marker($title)) { + continue; + } + + $matchScore = index_vanilla_uex_title_match_score($title, $queryName); + if ($matchScore <= 0) { continue; } @@ -195,7 +260,34 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN continue; } - $values[] = $priceValue; + $candidates[] = [ + 'price' => $priceValue, + 'score' => $matchScore, + ]; + } + + if ($candidates === []) { + return [ + 'has_estimate' => false, + 'average' => null, + 'formatted' => '—', + 'sample_count' => 0, + ]; + } + + $bestScore = max(array_column($candidates, 'score')); + $minimumAcceptedScore = $bestScore >= 3 ? 2 : $bestScore; + $values = []; + + foreach ($candidates as $candidate) { + if (($candidate['score'] ?? 0) < $minimumAcceptedScore) { + continue; + } + + $values[] = (int) $candidate['price']; + if (count($values) >= $sampleLimit) { + break; + } } if ($values === []) { @@ -217,6 +309,44 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN ]; } +function index_vanilla_uex_create_handle(string $url, string $userAgent, int $connectTimeout, int $timeout) +{ + $handle = curl_init(); + curl_setopt_array($handle, [ + CURLOPT_URL => $url, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_FOLLOWLOCATION => true, + CURLOPT_CONNECTTIMEOUT => $connectTimeout, + CURLOPT_TIMEOUT => $timeout, + CURLOPT_USERAGENT => $userAgent, + CURLOPT_ENCODING => '', + CURLOPT_SSL_VERIFYPEER => true, + CURLOPT_SSL_VERIFYHOST => 2, + CURLOPT_HTTPHEADER => [ + 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8', + 'Cache-Control: no-cache', + ], + ]); + + return $handle; +} + +function index_vanilla_uex_fetch_single_body(string $url, string $userAgent, int $connectTimeout = 8, int $timeout = 18): ?string +{ + $handle = index_vanilla_uex_create_handle($url, $userAgent, $connectTimeout, $timeout); + $body = curl_exec($handle); + $error = curl_error($handle); + $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); + curl_close($handle); + + if (!is_string($body) || $error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { + return null; + } + + return $body; +} + function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): array { $results = []; @@ -235,65 +365,63 @@ function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): return $results; } - $multiHandle = curl_multi_init(); - $handles = []; $userAgent = 'Mozilla/5.0 (compatible; FlatLogicVanillaDb/1.0; +https://uexcorp.space/)'; + $batchSize = 4; - foreach ($uniqueNames as $name) { - $url = 'https://uexcorp.space/search?q=' . rawurlencode($name); - $handle = curl_init(); - curl_setopt_array($handle, [ - CURLOPT_URL => $url, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_CONNECTTIMEOUT => 4, - CURLOPT_TIMEOUT => 8, - CURLOPT_USERAGENT => $userAgent, - CURLOPT_SSL_VERIFYPEER => true, - CURLOPT_SSL_VERIFYHOST => 2, - CURLOPT_HTTPHEADER => [ - 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8', - 'Cache-Control: no-cache', - ], - ]); + foreach (array_chunk(array_values($uniqueNames), $batchSize) as $nameBatch) { + $multiHandle = curl_multi_init(); + $handles = []; - curl_multi_add_handle($multiHandle, $handle); - $handles[$name] = $handle; - } - - $running = null; - do { - $status = curl_multi_exec($multiHandle, $running); - if ($running) { - curl_multi_select($multiHandle, 1.0); - } - } while ($running && $status === CURLM_OK); - - foreach ($handles as $name => $handle) { - $error = curl_error($handle); - $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); - $body = (string) curl_multi_getcontent($handle); - - if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { - $results[$name] = [ - 'has_estimate' => false, - 'average' => null, - 'formatted' => 'Indisponible', - 'sample_count' => 0, - 'error' => true, + foreach ($nameBatch as $name) { + $url = 'https://uexcorp.space/search?q=' . rawurlencode($name); + $handle = index_vanilla_uex_create_handle($url, $userAgent, 6, 12); + curl_multi_add_handle($multiHandle, $handle); + $handles[$name] = [ + 'handle' => $handle, + 'url' => $url, ]; - } else { + } + + $running = null; + do { + $status = curl_multi_exec($multiHandle, $running); + if ($running) { + curl_multi_select($multiHandle, 1.0); + } + } while ($running && $status === CURLM_OK); + + foreach ($handles as $name => $payload) { + $handle = $payload['handle']; + $url = $payload['url']; + $error = curl_error($handle); + $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE); + $body = (string) curl_multi_getcontent($handle); + + curl_multi_remove_handle($multiHandle, $handle); + curl_close($handle); + + if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') { + $body = index_vanilla_uex_fetch_single_body($url, $userAgent) ?? ''; + } + + if (trim($body) === '') { + $results[$name] = [ + 'has_estimate' => false, + 'average' => null, + 'formatted' => 'Indisponible', + 'sample_count' => 0, + 'error' => true, + ]; + continue; + } + $results[$name] = index_vanilla_uex_parse_estimate_from_html($body, $name, $sampleLimit); $results[$name]['error'] = false; } - curl_multi_remove_handle($multiHandle, $handle); - curl_close($handle); + curl_multi_close($multiHandle); } - curl_multi_close($multiHandle); - return $results; }