]*>/i', $html) ?: [];
foreach ($chunks as $chunk) {
- if (count($values) >= $sampleLimit) {
- break;
- }
-
if (!preg_match('/
]*class="text-bold"[^>]*>(.*?)<\/a>/is', $chunk, $titleMatches)) {
continue;
}
$title = index_vanilla_uex_normalize_whitespace((string) ($titleMatches[1] ?? ''));
- if ($title === '' || !preg_match('/^WTS\b/i', $title) || !index_vanilla_uex_title_matches_query($title, $queryName)) {
+ if ($title === '' || !index_vanilla_uex_title_has_wts_marker($title)) {
+ continue;
+ }
+
+ $matchScore = index_vanilla_uex_title_match_score($title, $queryName);
+ if ($matchScore <= 0) {
continue;
}
@@ -194,7 +259,34 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN
continue;
}
- $values[] = $priceValue;
+ $candidates[] = [
+ 'price' => $priceValue,
+ 'score' => $matchScore,
+ ];
+ }
+
+ if ($candidates === []) {
+ return [
+ 'has_estimate' => false,
+ 'average' => null,
+ 'formatted' => '—',
+ 'sample_count' => 0,
+ ];
+ }
+
+ $bestScore = max(array_column($candidates, 'score'));
+ $minimumAcceptedScore = $bestScore >= 3 ? 2 : $bestScore;
+ $values = [];
+
+ foreach ($candidates as $candidate) {
+ if (($candidate['score'] ?? 0) < $minimumAcceptedScore) {
+ continue;
+ }
+
+ $values[] = (int) $candidate['price'];
+ if (count($values) >= $sampleLimit) {
+ break;
+ }
}
if ($values === []) {
@@ -216,6 +308,44 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN
];
}
+/**
+ * Build a cURL easy handle pre-configured for a single page request.
+ *
+ * The handle is only configured here; the caller is responsible for running
+ * it (directly or through a multi handle) and for closing it afterwards.
+ *
+ * @param string $url            URL to request.
+ * @param string $userAgent      Value sent as the User-Agent header.
+ * @param int    $connectTimeout Connection timeout passed to CURLOPT_CONNECTTIMEOUT.
+ * @param int    $timeout        Overall timeout passed to CURLOPT_TIMEOUT.
+ *
+ * @return mixed The value returned by curl_init(), after configuration.
+ */
+function index_vanilla_uex_create_handle(string $url, string $userAgent, int $connectTimeout, int $timeout)
+{
+    $handle = curl_init();
+    curl_setopt_array($handle, [
+        CURLOPT_URL => $url,
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_FOLLOWLOCATION => true,
+        // Redirects are followed, so cap the chain length and keep every hop
+        // on HTTP(S) instead of relying on libcurl's version-dependent defaults.
+        CURLOPT_MAXREDIRS => 5,
+        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+        CURLOPT_CONNECTTIMEOUT => $connectTimeout,
+        CURLOPT_TIMEOUT => $timeout,
+        CURLOPT_USERAGENT => $userAgent,
+        // Empty string: advertise every content encoding this cURL build supports.
+        CURLOPT_ENCODING => '',
+        CURLOPT_SSL_VERIFYPEER => true,
+        CURLOPT_SSL_VERIFYHOST => 2,
+        CURLOPT_HTTPHEADER => [
+            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8',
+            'Cache-Control: no-cache',
+        ],
+    ]);
+
+    return $handle;
+}
+
+/**
+ * Perform one blocking request and return its body.
+ *
+ * @param string $url            URL to request.
+ * @param string $userAgent      Value sent as the User-Agent header.
+ * @param int    $connectTimeout Connection timeout forwarded to the handle factory.
+ * @param int    $timeout        Overall timeout forwarded to the handle factory.
+ *
+ * @return string|null The response body, or null when the transfer reported an
+ *                     error, the HTTP status is outside 200-299, or the body is
+ *                     empty once trimmed.
+ */
+function index_vanilla_uex_fetch_single_body(string $url, string $userAgent, int $connectTimeout = 8, int $timeout = 18): ?string
+{
+    $curl = index_vanilla_uex_create_handle($url, $userAgent, $connectTimeout, $timeout);
+
+    // Collect everything needed from the handle before releasing it.
+    $response = curl_exec($curl);
+    $transportError = curl_error($curl);
+    $status = (int) curl_getinfo($curl, CURLINFO_RESPONSE_CODE);
+    curl_close($curl);
+
+    if (!is_string($response) || $transportError !== '') {
+        return null;
+    }
+
+    if ($status < 200 || $status >= 300) {
+        return null;
+    }
+
+    return trim($response) === '' ? null : $response;
+}
+
function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): array
{
$results = [];
@@ -234,65 +364,63 @@ function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10):
return $results;
}
- $multiHandle = curl_multi_init();
- $handles = [];
$userAgent = 'Mozilla/5.0 (compatible; FlatLogicVanillaDb/1.0; +https://uexcorp.space/)';
+ $batchSize = 4;
- foreach ($uniqueNames as $name) {
- $url = 'https://uexcorp.space/search?q=' . rawurlencode($name);
- $handle = curl_init();
- curl_setopt_array($handle, [
- CURLOPT_URL => $url,
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_CONNECTTIMEOUT => 4,
- CURLOPT_TIMEOUT => 8,
- CURLOPT_USERAGENT => $userAgent,
- CURLOPT_SSL_VERIFYPEER => true,
- CURLOPT_SSL_VERIFYHOST => 2,
- CURLOPT_HTTPHEADER => [
- 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8',
- 'Cache-Control: no-cache',
- ],
- ]);
+ foreach (array_chunk(array_values($uniqueNames), $batchSize) as $nameBatch) {
+ $multiHandle = curl_multi_init();
+ $handles = [];
- curl_multi_add_handle($multiHandle, $handle);
- $handles[$name] = $handle;
- }
-
- $running = null;
- do {
- $status = curl_multi_exec($multiHandle, $running);
- if ($running) {
- curl_multi_select($multiHandle, 1.0);
- }
- } while ($running && $status === CURLM_OK);
-
- foreach ($handles as $name => $handle) {
- $error = curl_error($handle);
- $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
- $body = (string) curl_multi_getcontent($handle);
-
- if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') {
- $results[$name] = [
- 'has_estimate' => false,
- 'average' => null,
- 'formatted' => 'Indisponible',
- 'sample_count' => 0,
- 'error' => true,
+ foreach ($nameBatch as $name) {
+ $url = 'https://uexcorp.space/search?q=' . rawurlencode($name);
+ $handle = index_vanilla_uex_create_handle($url, $userAgent, 6, 12);
+ curl_multi_add_handle($multiHandle, $handle);
+ $handles[$name] = [
+ 'handle' => $handle,
+ 'url' => $url,
];
- } else {
+ }
+
+ $running = null;
+ do {
+ $status = curl_multi_exec($multiHandle, $running);
+ if ($running) {
+ curl_multi_select($multiHandle, 1.0);
+ }
+ } while ($running && $status === CURLM_OK);
+
+ foreach ($handles as $name => $payload) {
+ $handle = $payload['handle'];
+ $url = $payload['url'];
+ $error = curl_error($handle);
+ $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
+ $body = (string) curl_multi_getcontent($handle);
+
+ curl_multi_remove_handle($multiHandle, $handle);
+ curl_close($handle);
+
+ if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') {
+ $body = index_vanilla_uex_fetch_single_body($url, $userAgent) ?? '';
+ }
+
+ if (trim($body) === '') {
+ $results[$name] = [
+ 'has_estimate' => false,
+ 'average' => null,
+ 'formatted' => 'Indisponible',
+ 'sample_count' => 0,
+ 'error' => true,
+ ];
+ continue;
+ }
+
$results[$name] = index_vanilla_uex_parse_estimate_from_html($body, $name, $sampleLimit);
$results[$name]['error'] = false;
}
- curl_multi_remove_handle($multiHandle, $handle);
- curl_close($handle);
+ curl_multi_close($multiHandle);
}
- curl_multi_close($multiHandle);
-
return $results;
}
diff --git a/index.php b/index.php
index 67e8533..90c0b78 100644
--- a/index.php
+++ b/index.php
@@ -85,34 +85,98 @@ function index_vanilla_uex_normalize_search_text(string $value): string
return trim((string) preg_replace('/\s+/u', ' ', $value));
}
-function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool
+function index_vanilla_uex_normalize_compact_search_text(string $value): string
{
- $normalizedTitle = index_vanilla_uex_normalize_search_text($title);
- $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName);
+ $value = function_exists('mb_strtolower')
+ ? mb_strtolower($value, 'UTF-8')
+ : strtolower($value);
- if ($normalizedTitle === '' || $normalizedQuery === '') {
- return false;
+ return trim((string) preg_replace('/[^[:alnum:]]+/u', '', $value));
+}
+
+function index_vanilla_uex_tokenize_search_text(string $value): array
+{
+ $normalizedValue = index_vanilla_uex_normalize_search_text($value);
+ if ($normalizedValue === '') {
+ return [];
}
- if (strpos($normalizedTitle, $normalizedQuery) !== false) {
- return true;
- }
-
- $queryTokens = array_values(array_filter(explode(' ', $normalizedQuery), static function (string $token): bool {
- return preg_match('/\d/', $token) || strlen($token) >= 3;
+ $tokens = array_values(array_filter(explode(' ', $normalizedValue), static function (string $token): bool {
+ return preg_match('/\d/u', $token) || strlen($token) >= 2;
}));
+ return array_values(array_unique($tokens));
+}
+
+/**
+ * Extract the text that follows the first standalone "wts" marker of a title.
+ *
+ * The title is first passed through index_vanilla_uex_normalize_search_text().
+ * The marker pattern is lowercase and case-sensitive, so this presumably relies
+ * on that normalizer lowercasing its input (TODO confirm against the helper).
+ * After the marker, leading trade keywords (wts, wtb, sell, for, ...) are removed
+ * repeatedly, then one leading quantity prefix terminated by "-" or ":" is removed.
+ *
+ * @param string $title Raw listing title.
+ *
+ * @return string The remaining subject, or '' when the title normalizes to an
+ *                empty string, holds no "wts" marker, or nothing is left after it.
+ */
+function index_vanilla_uex_extract_wts_subject(string $title): string
+{
+    $haystack = index_vanilla_uex_normalize_search_text($title);
+    if ($haystack === '' || !preg_match('/\bwts\b/u', $haystack, $found, PREG_OFFSET_CAPTURE)) {
+        return '';
+    }
+
+    // PREG_OFFSET_CAPTURE reports byte offsets, which match substr()/strlen().
+    $markerText = (string) ($found[0][0] ?? 'wts');
+    $markerOffset = (int) ($found[0][1] ?? 0);
+    $remainder = trim(substr($haystack, $markerOffset + strlen($markerText)));
+
+    $leadingKeyword = '/^(?:wts|wtb|lts|ltb|sell|selling|trade|trading|for|offer|offering|looking|lf|want|wanted)\b[\s\-:]*/u';
+    $quantityPrefix = '/^(?:[a-z]{1,4}\s+)?\d+(?:\s+\d+)*(?:\s*[\-:])\s*/u';
+
+    // Keep stripping leading keywords until a pass changes nothing or the
+    // remainder becomes empty.
+    $previous = null;
+    while ($remainder !== '' && $remainder !== $previous) {
+        $previous = $remainder;
+        $remainder = trim((string) preg_replace($leadingKeyword, '', $previous));
+    }
+
+    return trim((string) preg_replace($quantityPrefix, '', $remainder));
+}
+
+/**
+ * Tell whether a title carries a "wts" marker followed by a usable subject.
+ *
+ * A title whose marker is followed by nothing (after keyword and quantity
+ * stripping) counts as having no marker.
+ *
+ * @param string $title Raw listing title.
+ *
+ * @return bool True when index_vanilla_uex_extract_wts_subject() yields a non-empty string.
+ */
+function index_vanilla_uex_title_has_wts_marker(string $title): bool
+{
+    $subject = index_vanilla_uex_extract_wts_subject($title);
+
+    return $subject !== '';
+}
+
+function index_vanilla_uex_title_match_score(string $title, string $queryName): int
+{
+ $normalizedTitle = index_vanilla_uex_extract_wts_subject($title);
+ $normalizedQuery = index_vanilla_uex_normalize_search_text($queryName);
+ $compactTitle = index_vanilla_uex_normalize_compact_search_text($normalizedTitle);
+ $compactQuery = index_vanilla_uex_normalize_compact_search_text($normalizedQuery);
+
+ if ($normalizedTitle === '' || $normalizedQuery === '' || $compactTitle === '' || $compactQuery === '') {
+ return 0;
+ }
+
+ if ($normalizedTitle === $normalizedQuery || $compactTitle === $compactQuery) {
+ return 3;
+ }
+
+ if (strpos($normalizedTitle, $normalizedQuery) !== false || strpos($compactTitle, $compactQuery) !== false) {
+ return 2;
+ }
+
+ $queryTokens = index_vanilla_uex_tokenize_search_text($normalizedQuery);
if ($queryTokens === []) {
- return false;
+ return 0;
}
foreach ($queryTokens as $token) {
if (strpos($normalizedTitle, $token) === false) {
- return false;
+ return 0;
}
}
- return true;
+ return 1;
+}
+
+/**
+ * Boolean wrapper around index_vanilla_uex_title_match_score().
+ *
+ * @param string $title     Raw listing title.
+ * @param string $queryName Name being searched for.
+ *
+ * @return bool True when the match score is strictly positive.
+ */
+function index_vanilla_uex_title_matches_query(string $title, string $queryName): bool
+{
+    $score = index_vanilla_uex_title_match_score($title, $queryName);
+
+    return $score > 0;
+}
function index_vanilla_uex_extract_price_value(string $rawPrice): ?int
@@ -169,20 +233,21 @@ function index_vanilla_uex_extract_price_value(string $rawPrice): ?int
function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryName, int $sampleLimit = 10): array
{
- $values = [];
+ $candidates = [];
$chunks = preg_split('/]*>/i', $html) ?: [];
foreach ($chunks as $chunk) {
- if (count($values) >= $sampleLimit) {
- break;
- }
-
if (!preg_match('/
]*class="text-bold"[^>]*>(.*?)<\/a>/is', $chunk, $titleMatches)) {
continue;
}
$title = index_vanilla_uex_normalize_whitespace((string) ($titleMatches[1] ?? ''));
- if ($title === '' || !preg_match('/^WTS\b/i', $title) || !index_vanilla_uex_title_matches_query($title, $queryName)) {
+ if ($title === '' || !index_vanilla_uex_title_has_wts_marker($title)) {
+ continue;
+ }
+
+ $matchScore = index_vanilla_uex_title_match_score($title, $queryName);
+ if ($matchScore <= 0) {
continue;
}
@@ -195,7 +260,34 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN
continue;
}
- $values[] = $priceValue;
+ $candidates[] = [
+ 'price' => $priceValue,
+ 'score' => $matchScore,
+ ];
+ }
+
+ if ($candidates === []) {
+ return [
+ 'has_estimate' => false,
+ 'average' => null,
+ 'formatted' => '—',
+ 'sample_count' => 0,
+ ];
+ }
+
+ $bestScore = max(array_column($candidates, 'score'));
+ $minimumAcceptedScore = $bestScore >= 3 ? 2 : $bestScore;
+ $values = [];
+
+ foreach ($candidates as $candidate) {
+ if (($candidate['score'] ?? 0) < $minimumAcceptedScore) {
+ continue;
+ }
+
+ $values[] = (int) $candidate['price'];
+ if (count($values) >= $sampleLimit) {
+ break;
+ }
}
if ($values === []) {
@@ -217,6 +309,44 @@ function index_vanilla_uex_parse_estimate_from_html(string $html, string $queryN
];
}
+/**
+ * Build a cURL easy handle pre-configured for a single page request.
+ *
+ * The handle is only configured here; the caller is responsible for running
+ * it (directly or through a multi handle) and for closing it afterwards.
+ *
+ * @param string $url            URL to request.
+ * @param string $userAgent      Value sent as the User-Agent header.
+ * @param int    $connectTimeout Connection timeout passed to CURLOPT_CONNECTTIMEOUT.
+ * @param int    $timeout        Overall timeout passed to CURLOPT_TIMEOUT.
+ *
+ * @return mixed The value returned by curl_init(), after configuration.
+ */
+function index_vanilla_uex_create_handle(string $url, string $userAgent, int $connectTimeout, int $timeout)
+{
+    $handle = curl_init();
+    curl_setopt_array($handle, [
+        CURLOPT_URL => $url,
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_FOLLOWLOCATION => true,
+        // Redirects are followed, so cap the chain length and keep every hop
+        // on HTTP(S) instead of relying on libcurl's version-dependent defaults.
+        CURLOPT_MAXREDIRS => 5,
+        CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+        CURLOPT_CONNECTTIMEOUT => $connectTimeout,
+        CURLOPT_TIMEOUT => $timeout,
+        CURLOPT_USERAGENT => $userAgent,
+        // Empty string: advertise every content encoding this cURL build supports.
+        CURLOPT_ENCODING => '',
+        CURLOPT_SSL_VERIFYPEER => true,
+        CURLOPT_SSL_VERIFYHOST => 2,
+        CURLOPT_HTTPHEADER => [
+            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+            'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8',
+            'Cache-Control: no-cache',
+        ],
+    ]);
+
+    return $handle;
+}
+
+/**
+ * Perform one blocking request and return its body.
+ *
+ * @param string $url            URL to request.
+ * @param string $userAgent      Value sent as the User-Agent header.
+ * @param int    $connectTimeout Connection timeout forwarded to the handle factory.
+ * @param int    $timeout        Overall timeout forwarded to the handle factory.
+ *
+ * @return string|null The response body, or null when the transfer reported an
+ *                     error, the HTTP status is outside 200-299, or the body is
+ *                     empty once trimmed.
+ */
+function index_vanilla_uex_fetch_single_body(string $url, string $userAgent, int $connectTimeout = 8, int $timeout = 18): ?string
+{
+    $curl = index_vanilla_uex_create_handle($url, $userAgent, $connectTimeout, $timeout);
+
+    // Collect everything needed from the handle before releasing it.
+    $response = curl_exec($curl);
+    $transportError = curl_error($curl);
+    $status = (int) curl_getinfo($curl, CURLINFO_RESPONSE_CODE);
+    curl_close($curl);
+
+    if (!is_string($response) || $transportError !== '') {
+        return null;
+    }
+
+    if ($status < 200 || $status >= 300) {
+        return null;
+    }
+
+    return trim($response) === '' ? null : $response;
+}
+
function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10): array
{
$results = [];
@@ -235,65 +365,63 @@ function index_vanilla_uex_fetch_estimates(array $names, int $sampleLimit = 10):
return $results;
}
- $multiHandle = curl_multi_init();
- $handles = [];
$userAgent = 'Mozilla/5.0 (compatible; FlatLogicVanillaDb/1.0; +https://uexcorp.space/)';
+ $batchSize = 4;
- foreach ($uniqueNames as $name) {
- $url = 'https://uexcorp.space/search?q=' . rawurlencode($name);
- $handle = curl_init();
- curl_setopt_array($handle, [
- CURLOPT_URL => $url,
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_CONNECTTIMEOUT => 4,
- CURLOPT_TIMEOUT => 8,
- CURLOPT_USERAGENT => $userAgent,
- CURLOPT_SSL_VERIFYPEER => true,
- CURLOPT_SSL_VERIFYHOST => 2,
- CURLOPT_HTTPHEADER => [
- 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language: fr-FR,fr;q=0.9,en;q=0.8',
- 'Cache-Control: no-cache',
- ],
- ]);
+ foreach (array_chunk(array_values($uniqueNames), $batchSize) as $nameBatch) {
+ $multiHandle = curl_multi_init();
+ $handles = [];
- curl_multi_add_handle($multiHandle, $handle);
- $handles[$name] = $handle;
- }
-
- $running = null;
- do {
- $status = curl_multi_exec($multiHandle, $running);
- if ($running) {
- curl_multi_select($multiHandle, 1.0);
- }
- } while ($running && $status === CURLM_OK);
-
- foreach ($handles as $name => $handle) {
- $error = curl_error($handle);
- $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
- $body = (string) curl_multi_getcontent($handle);
-
- if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') {
- $results[$name] = [
- 'has_estimate' => false,
- 'average' => null,
- 'formatted' => 'Indisponible',
- 'sample_count' => 0,
- 'error' => true,
+ foreach ($nameBatch as $name) {
+ $url = 'https://uexcorp.space/search?q=' . rawurlencode($name);
+ $handle = index_vanilla_uex_create_handle($url, $userAgent, 6, 12);
+ curl_multi_add_handle($multiHandle, $handle);
+ $handles[$name] = [
+ 'handle' => $handle,
+ 'url' => $url,
];
- } else {
+ }
+
+ $running = null;
+ do {
+ $status = curl_multi_exec($multiHandle, $running);
+ if ($running) {
+ curl_multi_select($multiHandle, 1.0);
+ }
+ } while ($running && $status === CURLM_OK);
+
+ foreach ($handles as $name => $payload) {
+ $handle = $payload['handle'];
+ $url = $payload['url'];
+ $error = curl_error($handle);
+ $httpCode = (int) curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
+ $body = (string) curl_multi_getcontent($handle);
+
+ curl_multi_remove_handle($multiHandle, $handle);
+ curl_close($handle);
+
+ if ($error !== '' || $httpCode < 200 || $httpCode >= 300 || trim($body) === '') {
+ $body = index_vanilla_uex_fetch_single_body($url, $userAgent) ?? '';
+ }
+
+ if (trim($body) === '') {
+ $results[$name] = [
+ 'has_estimate' => false,
+ 'average' => null,
+ 'formatted' => 'Indisponible',
+ 'sample_count' => 0,
+ 'error' => true,
+ ];
+ continue;
+ }
+
$results[$name] = index_vanilla_uex_parse_estimate_from_html($body, $name, $sampleLimit);
$results[$name]['error'] = false;
}
- curl_multi_remove_handle($multiHandle, $handle);
- curl_close($handle);
+ curl_multi_close($multiHandle);
}
- curl_multi_close($multiHandle);
-
return $results;
}