Revert to version d67465d
This commit is contained in:
parent
6cb783b9fb
commit
7957a70985
@ -12,6 +12,3 @@ EMAIL_USER=AKIAVEW7G4PQUBGM52OF
|
|||||||
EMAIL_PASS=BLnD4hKGb6YkSz3gaQrf8fnyLi3C3/EdjOOsLEDTDPTz
|
EMAIL_PASS=BLnD4hKGb6YkSz3gaQrf8fnyLi3C3/EdjOOsLEDTDPTz
|
||||||
SECRET_KEY=HUEyqESqgQ1yTwzVlO6wprC9Kf1J1xuA
|
SECRET_KEY=HUEyqESqgQ1yTwzVlO6wprC9Kf1J1xuA
|
||||||
PEXELS_KEY=Vc99rnmOhHhJAbgGQoKLZtsaIVfkeownoQNbTj78VemUjKh08ZYRbf18
|
PEXELS_KEY=Vc99rnmOhHhJAbgGQoKLZtsaIVfkeownoQNbTj78VemUjKh08ZYRbf18
|
||||||
FIRECRAWL_API_KEY=fc-409763513f6c458c9d1d09e460346b17
|
|
||||||
FIRECRAWL_BASE_URL=https://api.firecrawl.dev/v2
|
|
||||||
FIRECRAWL_ENABLED=true
|
|
||||||
|
|||||||
@ -1,49 +1,4 @@
|
|||||||
const fs = require('fs');
|
const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v1';
|
||||||
const path = require('path');
|
|
||||||
const axios = require('axios');
|
|
||||||
|
|
||||||
const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v2';
|
|
||||||
const FIRECRAWL_DEFAULT_POLL_INTERVAL_MS = 2000;
|
|
||||||
const FIRECRAWL_DEFAULT_TIMEOUT_MS = 45000;
|
|
||||||
|
|
||||||
const BACKEND_ENV_PATH = path.join(__dirname, '..', '..', '.env');
|
|
||||||
|
|
||||||
function readBackendEnvFile() {
|
|
||||||
try {
|
|
||||||
const raw = fs.readFileSync(BACKEND_ENV_PATH, 'utf8');
|
|
||||||
|
|
||||||
return raw.split(/\r?\n/).reduce((accumulator, line) => {
|
|
||||||
const trimmedLine = line.trim();
|
|
||||||
|
|
||||||
if (!trimmedLine || trimmedLine.startsWith('#')) {
|
|
||||||
return accumulator;
|
|
||||||
}
|
|
||||||
|
|
||||||
const separatorIndex = trimmedLine.indexOf('=');
|
|
||||||
|
|
||||||
if (separatorIndex === -1) {
|
|
||||||
return accumulator;
|
|
||||||
}
|
|
||||||
|
|
||||||
const key = trimmedLine.slice(0, separatorIndex).trim();
|
|
||||||
const value = trimmedLine.slice(separatorIndex + 1).trim();
|
|
||||||
|
|
||||||
accumulator[key] = value.replace(/^"|"$/g, '').replace(/^'|'$/g, '');
|
|
||||||
return accumulator;
|
|
||||||
}, {});
|
|
||||||
} catch (error) {
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function getEnvValue(name) {
|
|
||||||
if (process.env[name] !== undefined && process.env[name] !== null && process.env[name] !== '') {
|
|
||||||
return process.env[name];
|
|
||||||
}
|
|
||||||
|
|
||||||
return readBackendEnvFile()[name];
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function toBoolean(value, defaultValue = false) {
|
function toBoolean(value, defaultValue = false) {
|
||||||
if (value === undefined || value === null || value === '') {
|
if (value === undefined || value === null || value === '') {
|
||||||
@ -67,34 +22,12 @@ function toBoolean(value, defaultValue = false) {
|
|||||||
return defaultValue;
|
return defaultValue;
|
||||||
}
|
}
|
||||||
|
|
||||||
function toPositiveInteger(value, defaultValue) {
|
|
||||||
const parsed = Number(value);
|
|
||||||
|
|
||||||
if (Number.isInteger(parsed) && parsed > 0) {
|
|
||||||
return parsed;
|
|
||||||
}
|
|
||||||
|
|
||||||
return defaultValue;
|
|
||||||
}
|
|
||||||
|
|
||||||
function normalizeBaseUrl(baseUrl) {
|
|
||||||
return String(baseUrl || FIRECRAWL_DEFAULT_BASE_URL)
|
|
||||||
.trim()
|
|
||||||
.replace(/\/+$/, '');
|
|
||||||
}
|
|
||||||
|
|
||||||
function getFirecrawlRuntime() {
|
function getFirecrawlRuntime() {
|
||||||
const apiKey = String(getEnvValue('FIRECRAWL_API_KEY') || '').trim();
|
const apiKey = String(process.env.FIRECRAWL_API_KEY || '').trim();
|
||||||
const baseUrl = normalizeBaseUrl(getEnvValue('FIRECRAWL_BASE_URL'));
|
const baseUrl = String(
|
||||||
const enabled = toBoolean(getEnvValue('FIRECRAWL_ENABLED'), true);
|
process.env.FIRECRAWL_BASE_URL || FIRECRAWL_DEFAULT_BASE_URL,
|
||||||
const pollIntervalMs = toPositiveInteger(
|
).trim();
|
||||||
getEnvValue('FIRECRAWL_POLL_INTERVAL_MS'),
|
const enabled = toBoolean(process.env.FIRECRAWL_ENABLED, true);
|
||||||
FIRECRAWL_DEFAULT_POLL_INTERVAL_MS,
|
|
||||||
);
|
|
||||||
const timeoutMs = toPositiveInteger(
|
|
||||||
getEnvValue('FIRECRAWL_TIMEOUT_MS'),
|
|
||||||
FIRECRAWL_DEFAULT_TIMEOUT_MS,
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
provider: 'firecrawl',
|
provider: 'firecrawl',
|
||||||
@ -102,255 +35,29 @@ function getFirecrawlRuntime() {
|
|||||||
enabled,
|
enabled,
|
||||||
configured: Boolean(apiKey),
|
configured: Boolean(apiKey),
|
||||||
hasApiKey: Boolean(apiKey),
|
hasApiKey: Boolean(apiKey),
|
||||||
apiKey,
|
mode: 'scaffold_only',
|
||||||
pollIntervalMs,
|
|
||||||
timeoutMs,
|
|
||||||
mode: enabled && apiKey ? 'active' : 'scaffold_only',
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildFirecrawlMessage(runtime, entitlements, requestedPages) {
|
|
||||||
if (!entitlements?.canAdvancedCrawl) {
|
|
||||||
return 'Firecrawl is reserved for paid Advanced Crawl users. This request will stay on the built-in crawler.';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!runtime.enabled) {
|
|
||||||
return 'Firecrawl is configured in code, but FIRECRAWL_ENABLED is turned off. Paid users will stay on the built-in crawler until it is enabled.';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!runtime.configured) {
|
|
||||||
return 'Firecrawl is enabled for paid users, but FIRECRAWL_API_KEY is missing. Falling back to the built-in crawler until the key is configured.';
|
|
||||||
}
|
|
||||||
|
|
||||||
return requestedPages > 1
|
|
||||||
? 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered multi-page crawling.'
|
|
||||||
: 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered crawling.';
|
|
||||||
}
|
|
||||||
|
|
||||||
function getFirecrawlScaffold({ requestedPages, entitlements } = {}) {
|
function getFirecrawlScaffold({ requestedPages, entitlements } = {}) {
|
||||||
const runtime = getFirecrawlRuntime();
|
const runtime = getFirecrawlRuntime();
|
||||||
const availableForCurrentUser = Boolean(entitlements?.canAdvancedCrawl);
|
const wantsAdvancedCrawl = Number(requestedPages || 1) > 1;
|
||||||
const shouldUseFirecrawl = Boolean(
|
const advancedCrawlUnlocked = Boolean(entitlements?.canAdvancedCrawl);
|
||||||
availableForCurrentUser
|
const shouldUseFirecrawlLater = runtime.enabled && (wantsAdvancedCrawl || advancedCrawlUnlocked);
|
||||||
&& runtime.enabled
|
|
||||||
&& runtime.configured,
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
provider: 'firecrawl',
|
...runtime,
|
||||||
baseUrl: runtime.baseUrl,
|
status: runtime.configured ? 'ready_for_activation' : 'awaiting_api_key',
|
||||||
enabled: runtime.enabled,
|
|
||||||
configured: runtime.configured,
|
|
||||||
hasApiKey: runtime.hasApiKey,
|
|
||||||
mode: shouldUseFirecrawl ? 'active' : runtime.mode,
|
|
||||||
status: shouldUseFirecrawl ? 'active_for_paid_users' : 'scaffold_only',
|
|
||||||
wouldHandleJavascript: true,
|
wouldHandleJavascript: true,
|
||||||
wouldHandleSitemapDiscovery: true,
|
wouldHandleSitemapDiscovery: true,
|
||||||
availableForCurrentUser,
|
shouldUseFirecrawlLater,
|
||||||
shouldUseFirecrawl,
|
message: runtime.configured
|
||||||
usePaidOnly: true,
|
? 'Firecrawl scaffold is wired and ready for the next activation step, but this analyzer still uses the built-in crawler today.'
|
||||||
message: buildFirecrawlMessage(runtime, entitlements, requestedPages),
|
: 'Firecrawl scaffold is wired, but FIRECRAWL_API_KEY is not set yet. The analyzer still uses the built-in crawler for now.',
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function sleep(milliseconds) {
|
|
||||||
return new Promise((resolve) => {
|
|
||||||
setTimeout(resolve, milliseconds);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
function isAbsoluteUrl(value) {
|
|
||||||
return /^https?:\/\//i.test(String(value || ''));
|
|
||||||
}
|
|
||||||
|
|
||||||
function buildApiUrl(runtime, pathOrUrl) {
|
|
||||||
const value = String(pathOrUrl || '').trim();
|
|
||||||
|
|
||||||
if (!value) {
|
|
||||||
return runtime.baseUrl;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isAbsoluteUrl(value)) {
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (value.startsWith('/')) {
|
|
||||||
return `${runtime.baseUrl}${value}`;
|
|
||||||
}
|
|
||||||
|
|
||||||
return `${runtime.baseUrl}/${value}`;
|
|
||||||
}
|
|
||||||
|
|
||||||
function summarizeFirecrawlPayload(payload) {
|
|
||||||
if (!payload) {
|
|
||||||
return 'Unknown Firecrawl API error.';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof payload === 'string') {
|
|
||||||
return payload;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof payload?.error === 'string' && payload.error.trim()) {
|
|
||||||
return payload.error;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof payload?.message === 'string' && payload.message.trim()) {
|
|
||||||
return payload.message;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 'Unexpected Firecrawl API response.';
|
|
||||||
}
|
|
||||||
|
|
||||||
async function firecrawlRequest(runtime, method, pathOrUrl, options = {}) {
|
|
||||||
try {
|
|
||||||
const response = await axios({
|
|
||||||
method,
|
|
||||||
url: buildApiUrl(runtime, pathOrUrl),
|
|
||||||
timeout: options.timeout || runtime.timeoutMs,
|
|
||||||
data: options.data,
|
|
||||||
headers: {
|
|
||||||
Authorization: `Bearer ${runtime.apiKey}`,
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
...(options.headers || {}),
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
return response.data;
|
|
||||||
} catch (error) {
|
|
||||||
if (axios.isAxiosError(error)) {
|
|
||||||
const payload = error.response?.data;
|
|
||||||
const detail = summarizeFirecrawlPayload(payload);
|
|
||||||
const status = error.response?.status;
|
|
||||||
const wrappedError = new Error(
|
|
||||||
status
|
|
||||||
? `Firecrawl request failed with status ${status}: ${detail}`
|
|
||||||
: `Firecrawl request failed: ${detail}`,
|
|
||||||
);
|
|
||||||
|
|
||||||
wrappedError.code = status || 502;
|
|
||||||
wrappedError.response = payload;
|
|
||||||
throw wrappedError;
|
|
||||||
}
|
|
||||||
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function collectPagedStatus(runtime, initialStatus) {
|
|
||||||
const documents = Array.isArray(initialStatus?.data)
|
|
||||||
? [...initialStatus.data]
|
|
||||||
: [];
|
|
||||||
let nextUrl = initialStatus?.next || null;
|
|
||||||
|
|
||||||
while (nextUrl) {
|
|
||||||
const nextStatus = await firecrawlRequest(runtime, 'get', nextUrl);
|
|
||||||
|
|
||||||
if (Array.isArray(nextStatus?.data) && nextStatus.data.length > 0) {
|
|
||||||
documents.push(...nextStatus.data);
|
|
||||||
}
|
|
||||||
|
|
||||||
nextUrl = nextStatus?.next || null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
...initialStatus,
|
|
||||||
data: documents,
|
|
||||||
next: null,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
async function waitForCrawlCompletion(runtime, crawlId) {
|
|
||||||
const deadline = Date.now() + runtime.timeoutMs;
|
|
||||||
|
|
||||||
while (Date.now() <= deadline) {
|
|
||||||
const status = await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}`);
|
|
||||||
|
|
||||||
if (status?.status === 'completed' || status?.status === 'failed') {
|
|
||||||
return collectPagedStatus(runtime, status);
|
|
||||||
}
|
|
||||||
|
|
||||||
await sleep(runtime.pollIntervalMs);
|
|
||||||
}
|
|
||||||
|
|
||||||
const timeoutError = new Error(
|
|
||||||
`Firecrawl crawl timed out after ${Math.round(runtime.timeoutMs / 1000)} seconds.`,
|
|
||||||
);
|
|
||||||
timeoutError.code = 504;
|
|
||||||
throw timeoutError;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function getCrawlErrors(runtime, crawlId) {
|
|
||||||
try {
|
|
||||||
return await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}/errors`);
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to fetch Firecrawl crawl errors:', error);
|
|
||||||
return {
|
|
||||||
errors: [],
|
|
||||||
robotsBlocked: [],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
async function crawlSiteWithFirecrawl(url, requestedPages) {
|
|
||||||
const runtime = getFirecrawlRuntime();
|
|
||||||
|
|
||||||
if (!runtime.enabled) {
|
|
||||||
const error = new Error('Firecrawl is disabled in this environment.');
|
|
||||||
error.code = 503;
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!runtime.configured) {
|
|
||||||
const error = new Error('Firecrawl API key is not configured.');
|
|
||||||
error.code = 503;
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
|
|
||||||
const started = await firecrawlRequest(runtime, 'post', '/crawl', {
|
|
||||||
data: {
|
|
||||||
url,
|
|
||||||
limit: requestedPages,
|
|
||||||
sitemap: 'include',
|
|
||||||
crawlEntireDomain: true,
|
|
||||||
allowExternalLinks: false,
|
|
||||||
allowSubdomains: false,
|
|
||||||
ignoreQueryParameters: true,
|
|
||||||
scrapeOptions: {
|
|
||||||
formats: ['html'],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
});
|
|
||||||
|
|
||||||
const crawlId = started?.id;
|
|
||||||
|
|
||||||
if (!crawlId) {
|
|
||||||
const error = new Error('Firecrawl did not return a crawl job ID.');
|
|
||||||
error.code = 502;
|
|
||||||
error.response = started;
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
|
|
||||||
const status = await waitForCrawlCompletion(runtime, crawlId);
|
|
||||||
const crawlErrors = await getCrawlErrors(runtime, crawlId);
|
|
||||||
|
|
||||||
return {
|
|
||||||
crawlId,
|
|
||||||
provider: 'firecrawl',
|
|
||||||
status: status?.status || 'unknown',
|
|
||||||
total: status?.total || 0,
|
|
||||||
completed: status?.completed || 0,
|
|
||||||
creditsUsed: status?.creditsUsed || 0,
|
|
||||||
expiresAt: status?.expiresAt || null,
|
|
||||||
data: Array.isArray(status?.data) ? status.data : [],
|
|
||||||
errors: Array.isArray(crawlErrors?.errors) ? crawlErrors.errors : [],
|
|
||||||
robotsBlocked: Array.isArray(crawlErrors?.robotsBlocked)
|
|
||||||
? crawlErrors.robotsBlocked
|
|
||||||
: [],
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
getFirecrawlRuntime,
|
getFirecrawlRuntime,
|
||||||
getFirecrawlScaffold,
|
getFirecrawlScaffold,
|
||||||
crawlSiteWithFirecrawl,
|
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
const ValidationError = require('./notifications/errors/validation');
|
const ValidationError = require('./notifications/errors/validation');
|
||||||
|
|
||||||
const BASIC_MAX_PAGES_PER_CRAWL = 25;
|
const BASIC_MAX_PAGES_PER_CRAWL = 1;
|
||||||
const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
|
const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
|
||||||
const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
|
const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
|
||||||
const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';
|
const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';
|
||||||
@ -68,7 +68,7 @@ function ensureRequestedPagesAllowed(requestedPages, currentUser) {
|
|||||||
|
|
||||||
if (requestedPages > entitlements.maxPagesPerCrawl) {
|
if (requestedPages > entitlements.maxPagesPerCrawl) {
|
||||||
const error = new Error(
|
const error = new Error(
|
||||||
`This analyzer supports up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Reduce the requested page count to continue.`,
|
`Your current plan allows up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Upgrade to Advanced Crawl to analyze ${requestedPages} pages.`,
|
||||||
);
|
);
|
||||||
error.code = 403;
|
error.code = 403;
|
||||||
throw error;
|
throw error;
|
||||||
|
|||||||
@ -8,7 +8,7 @@ const {
|
|||||||
ensureRequestedPagesAllowed,
|
ensureRequestedPagesAllowed,
|
||||||
ensurePlatformOutputAllowed,
|
ensurePlatformOutputAllowed,
|
||||||
} = require('./siteEntitlements');
|
} = require('./siteEntitlements');
|
||||||
const { getFirecrawlScaffold, crawlSiteWithFirecrawl } = require('./firecrawl');
|
const { getFirecrawlScaffold } = require('./firecrawl');
|
||||||
|
|
||||||
const REQUEST_TIMEOUT = 15000;
|
const REQUEST_TIMEOUT = 15000;
|
||||||
const PREVIEW_LIMIT = 5;
|
const PREVIEW_LIMIT = 5;
|
||||||
@ -271,152 +271,6 @@ function normalizeAllowedHostnames(allowedHostnames) {
|
|||||||
return new Set();
|
return new Set();
|
||||||
}
|
}
|
||||||
|
|
||||||
function normalizeTargetPathname(pathname) {
|
|
||||||
const trimmedPathname = String(pathname || '').trim();
|
|
||||||
|
|
||||||
if (!trimmedPathname || trimmedPathname === '/') {
|
|
||||||
return '/';
|
|
||||||
}
|
|
||||||
|
|
||||||
return `/${trimmedPathname.replace(/^\/+/, '').replace(/\/+$/, '')}`;
|
|
||||||
}
|
|
||||||
|
|
||||||
function buildCrawlTarget(rawTarget, baseUrl, label) {
|
|
||||||
const trimmedTarget = String(rawTarget || '').trim();
|
|
||||||
|
|
||||||
if (!trimmedTarget) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsedTarget;
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (/^https?:\/\//i.test(trimmedTarget)) {
|
|
||||||
parsedTarget = new URL(trimmedTarget);
|
|
||||||
} else if (trimmedTarget.startsWith('/')) {
|
|
||||||
parsedTarget = new URL(trimmedTarget, baseUrl);
|
|
||||||
} else {
|
|
||||||
parsedTarget = new URL(`/${trimmedTarget.replace(/^\/+/, '')}`, baseUrl);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`);
|
|
||||||
targetError.code = 400;
|
|
||||||
throw targetError;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!['http:', 'https:'].includes(parsedTarget.protocol)) {
|
|
||||||
const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`);
|
|
||||||
targetError.code = 400;
|
|
||||||
throw targetError;
|
|
||||||
}
|
|
||||||
|
|
||||||
const baseHostname = new URL(baseUrl).hostname.toLowerCase();
|
|
||||||
|
|
||||||
if (parsedTarget.hostname.toLowerCase() !== baseHostname) {
|
|
||||||
const targetError = new Error(
|
|
||||||
`${label} targets must stay on the same website as the analyzed URL.`,
|
|
||||||
);
|
|
||||||
targetError.code = 400;
|
|
||||||
throw targetError;
|
|
||||||
}
|
|
||||||
|
|
||||||
parsedTarget.hash = '';
|
|
||||||
parsedTarget.search = '';
|
|
||||||
|
|
||||||
const path = normalizeTargetPathname(parsedTarget.pathname);
|
|
||||||
const url = normalizeUrl(parsedTarget.toString());
|
|
||||||
|
|
||||||
return {
|
|
||||||
input: trimmedTarget,
|
|
||||||
label: /^https?:\/\//i.test(trimmedTarget) ? url : path,
|
|
||||||
path,
|
|
||||||
url,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseCrawlTargets(rawTargets, baseUrl, label) {
|
|
||||||
const targetValues = Array.isArray(rawTargets)
|
|
||||||
? rawTargets
|
|
||||||
: String(rawTargets || '').split(/\r?\n/);
|
|
||||||
const dedupedTargets = new Map();
|
|
||||||
|
|
||||||
targetValues
|
|
||||||
.map((targetValue) => String(targetValue || '').trim())
|
|
||||||
.filter(Boolean)
|
|
||||||
.forEach((targetValue) => {
|
|
||||||
const normalizedTarget = buildCrawlTarget(targetValue, baseUrl, label);
|
|
||||||
|
|
||||||
dedupedTargets.set(normalizedTarget.url, normalizedTarget);
|
|
||||||
});
|
|
||||||
|
|
||||||
return Array.from(dedupedTargets.values());
|
|
||||||
}
|
|
||||||
|
|
||||||
function normalizeCrawlTargets(data, baseUrl) {
|
|
||||||
return {
|
|
||||||
includeTargets: parseCrawlTargets(data?.includeTargets, baseUrl, 'include'),
|
|
||||||
excludeTargets: parseCrawlTargets(data?.excludeTargets, baseUrl, 'exclude'),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function isUrlMatchingTarget(candidateUrl, target) {
|
|
||||||
if (!candidateUrl || !target?.path) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
let parsedUrl;
|
|
||||||
|
|
||||||
try {
|
|
||||||
parsedUrl = new URL(normalizeUrl(candidateUrl));
|
|
||||||
} catch (error) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const candidatePath = normalizeTargetPathname(parsedUrl.pathname);
|
|
||||||
|
|
||||||
if (target.path === '/') {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return candidatePath === target.path || candidatePath.startsWith(`${target.path}/`);
|
|
||||||
}
|
|
||||||
|
|
||||||
function matchesAnyCrawlTarget(candidateUrl, targets = []) {
|
|
||||||
return targets.some((target) => isUrlMatchingTarget(candidateUrl, target));
|
|
||||||
}
|
|
||||||
|
|
||||||
function isUrlAllowedByCrawlTargets(candidateUrl, crawlTargets = {}) {
|
|
||||||
const includeTargets = crawlTargets.includeTargets || [];
|
|
||||||
const excludeTargets = crawlTargets.excludeTargets || [];
|
|
||||||
|
|
||||||
if (includeTargets.length > 0 && !matchesAnyCrawlTarget(candidateUrl, includeTargets)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (excludeTargets.length > 0 && matchesAnyCrawlTarget(candidateUrl, excludeTargets)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
function buildSeedUrls(baseUrl, crawlTargets = {}) {
|
|
||||||
const seedUrls = new Set([baseUrl]);
|
|
||||||
|
|
||||||
(crawlTargets.includeTargets || []).forEach((target) => {
|
|
||||||
seedUrls.add(target.url);
|
|
||||||
});
|
|
||||||
|
|
||||||
return Array.from(seedUrls);
|
|
||||||
}
|
|
||||||
|
|
||||||
function summarizeCrawlTargets(crawlTargets = {}) {
|
|
||||||
return {
|
|
||||||
includeTargets: (crawlTargets.includeTargets || []).map((target) => target.label),
|
|
||||||
excludeTargets: (crawlTargets.excludeTargets || []).map((target) => target.label),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function normalizeCrawlUrl(rawUrl, parentUrl, allowedHostnames) {
|
function normalizeCrawlUrl(rawUrl, parentUrl, allowedHostnames) {
|
||||||
if (!rawUrl || typeof rawUrl !== 'string') {
|
if (!rawUrl || typeof rawUrl !== 'string') {
|
||||||
return null;
|
return null;
|
||||||
@ -549,184 +403,12 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function analyzeFetchedPage({
|
async function crawlPages(baseUrl, requestedPages) {
|
||||||
requestedUrl,
|
|
||||||
analyzedUrl,
|
|
||||||
html,
|
|
||||||
statusCode,
|
|
||||||
headers = {},
|
|
||||||
allowedHostnames,
|
|
||||||
discoveredLinks = null,
|
|
||||||
pageTitle = null,
|
|
||||||
}) {
|
|
||||||
const normalizedAnalyzedUrl = normalizeUrl(analyzedUrl || requestedUrl);
|
|
||||||
const normalizedAllowedHostnames = normalizeAllowedHostnames(allowedHostnames);
|
|
||||||
const analyzedHostname = new URL(normalizedAnalyzedUrl).hostname.toLowerCase();
|
|
||||||
normalizedAllowedHostnames.add(analyzedHostname);
|
|
||||||
|
|
||||||
if (allowedHostnames instanceof Set) {
|
|
||||||
allowedHostnames.add(analyzedHostname);
|
|
||||||
}
|
|
||||||
|
|
||||||
const resolvedHtml = typeof html === 'string' ? html : '';
|
|
||||||
const resolvedPageTitle = pageTitle || extractPageTitle(resolvedHtml);
|
|
||||||
const platform = detectPlatform(resolvedHtml, headers, normalizedAnalyzedUrl);
|
|
||||||
const schema = extractSchemaSummary(resolvedHtml);
|
|
||||||
const pageSignals = inferPageSignals(
|
|
||||||
resolvedHtml,
|
|
||||||
normalizedAnalyzedUrl,
|
|
||||||
resolvedPageTitle,
|
|
||||||
platform,
|
|
||||||
);
|
|
||||||
const normalizedLinks = Array.isArray(discoveredLinks)
|
|
||||||
? Array.from(
|
|
||||||
new Set(
|
|
||||||
discoveredLinks
|
|
||||||
.map((linkUrl) => normalizeCrawlUrl(linkUrl, normalizedAnalyzedUrl, normalizedAllowedHostnames))
|
|
||||||
.filter(Boolean),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
: extractInternalLinks(
|
|
||||||
resolvedHtml,
|
|
||||||
normalizedAnalyzedUrl,
|
|
||||||
normalizedAllowedHostnames,
|
|
||||||
);
|
|
||||||
|
|
||||||
return {
|
|
||||||
requestedUrl: requestedUrl || normalizedAnalyzedUrl,
|
|
||||||
analyzedUrl: normalizedAnalyzedUrl,
|
|
||||||
pageTitle: resolvedPageTitle,
|
|
||||||
statusCode: statusCode || null,
|
|
||||||
html: resolvedHtml,
|
|
||||||
platform,
|
|
||||||
schema,
|
|
||||||
pageSignals,
|
|
||||||
discoveredLinks: normalizedLinks,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
function transformFirecrawlDocument(document, allowedHostnames) {
|
|
||||||
const metadata = document?.metadata || {};
|
|
||||||
const sourceUrl =
|
|
||||||
metadata.sourceURL
|
|
||||||
|| metadata.sourceUrl
|
|
||||||
|| metadata.url
|
|
||||||
|| document?.url
|
|
||||||
|| document?.sourceURL
|
|
||||||
|| document?.sourceUrl;
|
|
||||||
|
|
||||||
if (!sourceUrl) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
const html =
|
|
||||||
typeof document?.html === 'string'
|
|
||||||
? document.html
|
|
||||||
: typeof document?.rawHtml === 'string'
|
|
||||||
? document.rawHtml
|
|
||||||
: typeof document?.content === 'string'
|
|
||||||
? document.content
|
|
||||||
: '';
|
|
||||||
|
|
||||||
return analyzeFetchedPage({
|
|
||||||
requestedUrl: sourceUrl,
|
|
||||||
analyzedUrl: sourceUrl,
|
|
||||||
html,
|
|
||||||
statusCode: document?.metadata?.statusCode || 200,
|
|
||||||
headers: {},
|
|
||||||
allowedHostnames,
|
|
||||||
discoveredLinks: Array.isArray(document?.links) ? document.links : null,
|
|
||||||
pageTitle: metadata.title || null,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
async function crawlPagesWithFirecrawl(baseUrl, requestedPages, crawlTargets = {}) {
|
|
||||||
const normalizedBaseUrl = normalizeUrl(baseUrl);
|
const normalizedBaseUrl = normalizeUrl(baseUrl);
|
||||||
const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]);
|
const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]);
|
||||||
const firecrawlResult = await crawlSiteWithFirecrawl(normalizedBaseUrl, requestedPages);
|
|
||||||
const pages = [];
|
|
||||||
const analyzedUrls = new Set();
|
|
||||||
|
|
||||||
(firecrawlResult.data || []).forEach((document) => {
|
|
||||||
try {
|
|
||||||
const page = transformFirecrawlDocument(document, allowedHostnames);
|
|
||||||
|
|
||||||
if (
|
|
||||||
!page
|
|
||||||
|| analyzedUrls.has(page.analyzedUrl)
|
|
||||||
|| !isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)
|
|
||||||
) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
analyzedUrls.add(page.analyzedUrl);
|
|
||||||
pages.push(page);
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Failed to transform Firecrawl document:', error);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
const failedPages = [];
|
|
||||||
|
|
||||||
(firecrawlResult.errors || []).forEach((entry) => {
|
|
||||||
const failedUrl = normalizeCrawlUrl(
|
|
||||||
entry?.path || entry?.url || entry?.sourceURL || normalizedBaseUrl,
|
|
||||||
normalizedBaseUrl,
|
|
||||||
allowedHostnames,
|
|
||||||
) || normalizedBaseUrl;
|
|
||||||
|
|
||||||
if (!isUrlAllowedByCrawlTargets(failedUrl, crawlTargets)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
failedPages.push({
|
|
||||||
url: failedUrl,
|
|
||||||
error: entry?.error || entry?.message || 'Firecrawl could not fetch this page.',
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
(firecrawlResult.robotsBlocked || []).forEach((entry) => {
|
|
||||||
const blockedUrl = normalizeCrawlUrl(
|
|
||||||
entry?.path || entry?.url || normalizedBaseUrl,
|
|
||||||
normalizedBaseUrl,
|
|
||||||
allowedHostnames,
|
|
||||||
) || normalizedBaseUrl;
|
|
||||||
|
|
||||||
if (!isUrlAllowedByCrawlTargets(blockedUrl, crawlTargets)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
failedPages.push({
|
|
||||||
url: blockedUrl,
|
|
||||||
error: 'Blocked by robots.txt during Firecrawl crawl.',
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
provider: 'firecrawl',
|
|
||||||
pages,
|
|
||||||
failedPages,
|
|
||||||
discoveredInternalPages: Math.max((firecrawlResult.total || pages.length) - 1, 0),
|
|
||||||
firecrawlJob: {
|
|
||||||
crawlId: firecrawlResult.crawlId,
|
|
||||||
status: firecrawlResult.status,
|
|
||||||
total: firecrawlResult.total,
|
|
||||||
completed: firecrawlResult.completed,
|
|
||||||
creditsUsed: firecrawlResult.creditsUsed,
|
|
||||||
expiresAt: firecrawlResult.expiresAt,
|
|
||||||
failedPages: failedPages.length,
|
|
||||||
},
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) {
|
|
||||||
const normalizedBaseUrl = normalizeUrl(baseUrl);
|
|
||||||
const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]);
|
|
||||||
const seedUrls = buildSeedUrls(normalizedBaseUrl, crawlTargets);
|
|
||||||
const seedUrlSet = new Set(seedUrls);
|
|
||||||
const visitedUrls = new Set();
|
const visitedUrls = new Set();
|
||||||
const queuedUrls = new Set(seedUrls);
|
const queuedUrls = new Set([normalizedBaseUrl]);
|
||||||
const pendingUrls = [...seedUrls];
|
const pendingUrls = [normalizedBaseUrl];
|
||||||
const pages = [];
|
const pages = [];
|
||||||
const failedPages = [];
|
const failedPages = [];
|
||||||
let discoveredInternalPages = 0;
|
let discoveredInternalPages = 0;
|
||||||
@ -738,29 +420,15 @@ async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const isBootstrapSeed = seedUrlSet.has(nextUrl) && nextUrl === normalizedBaseUrl;
|
|
||||||
|
|
||||||
if (!isBootstrapSeed && !isUrlAllowedByCrawlTargets(nextUrl, crawlTargets)) {
|
|
||||||
visitedUrls.add(nextUrl);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
visitedUrls.add(nextUrl);
|
visitedUrls.add(nextUrl);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const page = await fetchAnalyzedPage(nextUrl, allowedHostnames);
|
const page = await fetchAnalyzedPage(nextUrl, allowedHostnames);
|
||||||
visitedUrls.add(page.analyzedUrl);
|
visitedUrls.add(page.analyzedUrl);
|
||||||
queuedUrls.add(page.analyzedUrl);
|
queuedUrls.add(page.analyzedUrl);
|
||||||
|
pages.push(page);
|
||||||
if (isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)) {
|
|
||||||
pages.push(page);
|
|
||||||
}
|
|
||||||
|
|
||||||
page.discoveredLinks.forEach((linkUrl) => {
|
page.discoveredLinks.forEach((linkUrl) => {
|
||||||
if (!isUrlAllowedByCrawlTargets(linkUrl, crawlTargets)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!visitedUrls.has(linkUrl) && !queuedUrls.has(linkUrl)) {
|
if (!visitedUrls.has(linkUrl) && !queuedUrls.has(linkUrl)) {
|
||||||
queuedUrls.add(linkUrl);
|
queuedUrls.add(linkUrl);
|
||||||
pendingUrls.push(linkUrl);
|
pendingUrls.push(linkUrl);
|
||||||
@ -855,29 +523,30 @@ function buildCrawlNotice({
|
|||||||
requestedPages,
|
requestedPages,
|
||||||
actualPagesAnalyzed,
|
actualPagesAnalyzed,
|
||||||
failedPages,
|
failedPages,
|
||||||
crawlTargetSummary,
|
discoveredInternalPages,
|
||||||
|
firecrawl,
|
||||||
}) {
|
}) {
|
||||||
const parts = [];
|
if (requestedPages <= 1) {
|
||||||
|
return null;
|
||||||
if (requestedPages > 1) {
|
|
||||||
parts.push(
|
|
||||||
`The crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (actualPagesAnalyzed < requestedPages) {
|
const parts = [
|
||||||
parts.push('Fewer matching crawlable pages were found than requested.');
|
`Advanced crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`,
|
||||||
}
|
];
|
||||||
|
|
||||||
if ((crawlTargetSummary?.includeTargets || []).length > 0 || (crawlTargetSummary?.excludeTargets || []).length > 0) {
|
if (discoveredInternalPages + 1 < requestedPages) {
|
||||||
parts.push('Custom include/exclude targeting was applied to this report.');
|
parts.push('Fewer crawlable internal HTML pages were discovered than requested.');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (failedPages > 0) {
|
if (failedPages > 0) {
|
||||||
parts.push(`${failedPages} page${failedPages === 1 ? '' : 's'} could not be fetched during the crawl.`);
|
parts.push(`${failedPages} page${failedPages === 1 ? '' : 's'} could not be fetched during the crawl.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return parts.length > 0 ? parts.join(' ') : null;
|
if (firecrawl?.message) {
|
||||||
|
parts.push(firecrawl.message);
|
||||||
|
}
|
||||||
|
|
||||||
|
return parts.join(' ');
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildAggregateAnalysis({
|
function buildAggregateAnalysis({
|
||||||
@ -888,14 +557,11 @@ function buildAggregateAnalysis({
|
|||||||
discoveredInternalPages,
|
discoveredInternalPages,
|
||||||
failedPages,
|
failedPages,
|
||||||
firecrawl,
|
firecrawl,
|
||||||
crawlTargets,
|
|
||||||
provider = 'internal',
|
|
||||||
}) {
|
}) {
|
||||||
const homepage = pageAnalyses[0];
|
const homepage = pageAnalyses[0];
|
||||||
const finishedAt = new Date();
|
const finishedAt = new Date();
|
||||||
const aggregateSchema = buildAggregateSchema(pageAnalyses);
|
const aggregateSchema = buildAggregateSchema(pageAnalyses);
|
||||||
const aggregateSignals = buildAggregateSignals(pageAnalyses);
|
const aggregateSignals = buildAggregateSignals(pageAnalyses);
|
||||||
const crawlTargetSummary = summarizeCrawlTargets(crawlTargets);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
requestedUrl: normalizedUrl,
|
requestedUrl: normalizedUrl,
|
||||||
@ -915,9 +581,7 @@ function buildAggregateAnalysis({
|
|||||||
allowedPages: entitlements.maxPagesPerCrawl,
|
allowedPages: entitlements.maxPagesPerCrawl,
|
||||||
actualPagesAnalyzed: pageAnalyses.length,
|
actualPagesAnalyzed: pageAnalyses.length,
|
||||||
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
|
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
|
||||||
provider,
|
provider: 'internal',
|
||||||
includeTargets: crawlTargetSummary.includeTargets,
|
|
||||||
excludeTargets: crawlTargetSummary.excludeTargets,
|
|
||||||
},
|
},
|
||||||
crawlSummary: {
|
crawlSummary: {
|
||||||
pagesWithStructuredData: pageAnalyses.filter((page) => page.schema?.hasStructuredData).length,
|
pagesWithStructuredData: pageAnalyses.filter((page) => page.schema?.hasStructuredData).length,
|
||||||
@ -943,13 +607,14 @@ function buildAggregateAnalysis({
|
|||||||
requestedPages,
|
requestedPages,
|
||||||
actualPagesAnalyzed: pageAnalyses.length,
|
actualPagesAnalyzed: pageAnalyses.length,
|
||||||
failedPages: failedPages.length,
|
failedPages: failedPages.length,
|
||||||
crawlTargetSummary,
|
discoveredInternalPages,
|
||||||
|
firecrawl,
|
||||||
}),
|
}),
|
||||||
finishedAt,
|
finishedAt,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'internal') {
|
function buildFailureAnalysis(normalizedUrl, error, firecrawl) {
|
||||||
const isAxiosError = axios.isAxiosError(error);
|
const isAxiosError = axios.isAxiosError(error);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -968,9 +633,6 @@ function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'inter
|
|||||||
rdfa: { count: 0, detected: false },
|
rdfa: { count: 0, detected: false },
|
||||||
},
|
},
|
||||||
firecrawl,
|
firecrawl,
|
||||||
crawlPlan: {
|
|
||||||
provider,
|
|
||||||
},
|
|
||||||
error: isAxiosError
|
error: isAxiosError
|
||||||
? error.response
|
? error.response
|
||||||
? `Request failed with status ${error.response.status}`
|
? `Request failed with status ${error.response.status}`
|
||||||
@ -1461,8 +1123,7 @@ module.exports = class SitesService {
|
|||||||
const requestedPages = parseRequestedPages(data?.requestedPages);
|
const requestedPages = parseRequestedPages(data?.requestedPages);
|
||||||
const entitlements = ensureRequestedPagesAllowed(requestedPages, currentUser);
|
const entitlements = ensureRequestedPagesAllowed(requestedPages, currentUser);
|
||||||
const normalizedUrl = normalizeUrl(data?.url || data?.base_url);
|
const normalizedUrl = normalizeUrl(data?.url || data?.base_url);
|
||||||
const crawlTargets = normalizeCrawlTargets(data, normalizedUrl);
|
const firecrawl = getFirecrawlScaffold({ requestedPages, entitlements });
|
||||||
let firecrawl = getFirecrawlScaffold({ requestedPages, entitlements });
|
|
||||||
const requestedName =
|
const requestedName =
|
||||||
typeof data?.name === 'string' && data.name.trim()
|
typeof data?.name === 'string' && data.name.trim()
|
||||||
? data.name.trim()
|
? data.name.trim()
|
||||||
@ -1529,50 +1190,12 @@ module.exports = class SitesService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
let crawlResult;
|
const crawlResult = await crawlPages(normalizedUrl, requestedPages);
|
||||||
|
|
||||||
if (firecrawl.shouldUseFirecrawl) {
|
|
||||||
try {
|
|
||||||
crawlResult = await crawlPagesWithFirecrawl(normalizedUrl, requestedPages, crawlTargets);
|
|
||||||
firecrawl = {
|
|
||||||
...firecrawl,
|
|
||||||
currentProvider: 'firecrawl',
|
|
||||||
crawlId: crawlResult.firecrawlJob?.crawlId || null,
|
|
||||||
crawlStatus: crawlResult.firecrawlJob?.status || null,
|
|
||||||
creditsUsed: crawlResult.firecrawlJob?.creditsUsed || 0,
|
|
||||||
message: crawlResult.firecrawlJob?.status === 'failed'
|
|
||||||
? 'Firecrawl ran for this paid request, but the crawl reported failures. Partial results are shown when available.'
|
|
||||||
: 'Firecrawl handled this paid request with sitemap-aware, JavaScript-rendered crawling.',
|
|
||||||
};
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Firecrawl crawl failed, falling back to internal crawl:', error);
|
|
||||||
firecrawl = {
|
|
||||||
...firecrawl,
|
|
||||||
currentProvider: 'internal',
|
|
||||||
status: 'fallback_internal_after_error',
|
|
||||||
shouldUseFirecrawl: false,
|
|
||||||
fallbackReason: error.message,
|
|
||||||
message: `Firecrawl was selected for this paid request but failed to run (${error.message}). The analyzer fell back to the built-in crawler.`,
|
|
||||||
};
|
|
||||||
crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets);
|
|
||||||
firecrawl = {
|
|
||||||
...firecrawl,
|
|
||||||
currentProvider: 'internal',
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const pageAnalyses = crawlResult.pages;
|
const pageAnalyses = crawlResult.pages;
|
||||||
|
|
||||||
if (pageAnalyses.length === 0) {
|
if (pageAnalyses.length === 0) {
|
||||||
const firstFailure = crawlResult.failedPages[0];
|
const firstFailure = crawlResult.failedPages[0];
|
||||||
const error = new Error(
|
const error = new Error(firstFailure?.error || 'Site analysis failed.');
|
||||||
crawlTargets.includeTargets.length > 0 || crawlTargets.excludeTargets.length > 0
|
|
||||||
? 'No pages matched the include/exclude targeting rules you entered.'
|
|
||||||
: firstFailure?.error || 'Site analysis failed.',
|
|
||||||
);
|
|
||||||
error.code = 400;
|
error.code = 400;
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
@ -1585,8 +1208,6 @@ module.exports = class SitesService {
|
|||||||
discoveredInternalPages: crawlResult.discoveredInternalPages,
|
discoveredInternalPages: crawlResult.discoveredInternalPages,
|
||||||
failedPages: crawlResult.failedPages,
|
failedPages: crawlResult.failedPages,
|
||||||
firecrawl,
|
firecrawl,
|
||||||
crawlTargets,
|
|
||||||
provider: crawlResult.provider || 'internal',
|
|
||||||
});
|
});
|
||||||
const homepage = pageAnalyses[0];
|
const homepage = pageAnalyses[0];
|
||||||
const finishedAt = analysis.finishedAt;
|
const finishedAt = analysis.finishedAt;
|
||||||
@ -1670,12 +1291,7 @@ module.exports = class SitesService {
|
|||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Site analysis failed:', error);
|
console.error('Site analysis failed:', error);
|
||||||
|
|
||||||
const failureAnalysis = buildFailureAnalysis(
|
const failureAnalysis = buildFailureAnalysis(normalizedUrl, error, firecrawl);
|
||||||
normalizedUrl,
|
|
||||||
error,
|
|
||||||
firecrawl,
|
|
||||||
firecrawl?.currentProvider || 'internal',
|
|
||||||
);
|
|
||||||
const failedAt = new Date();
|
const failedAt = new Date();
|
||||||
const failureTransaction = await db.sequelize.transaction();
|
const failureTransaction = await db.sequelize.transaction();
|
||||||
let failedSite;
|
let failedSite;
|
||||||
@ -1735,7 +1351,7 @@ module.exports = class SitesService {
|
|||||||
allowedPages: entitlements.maxPagesPerCrawl,
|
allowedPages: entitlements.maxPagesPerCrawl,
|
||||||
actualPagesAnalyzed: 0,
|
actualPagesAnalyzed: 0,
|
||||||
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
|
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
|
||||||
provider: failureAnalysis.crawlPlan?.provider || 'internal',
|
provider: 'internal',
|
||||||
},
|
},
|
||||||
entitlements,
|
entitlements,
|
||||||
},
|
},
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
import Image from 'next/image'
|
|
||||||
import React from 'react'
|
import React from 'react'
|
||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
@ -7,12 +6,10 @@ type Props = {
|
|||||||
|
|
||||||
export default function Logo({ className = '' }: Props) {
|
export default function Logo({ className = '' }: Props) {
|
||||||
return (
|
return (
|
||||||
<Image
|
<img
|
||||||
src="https://flatlogic.com/logo.svg"
|
src={"https://flatlogic.com/logo.svg"}
|
||||||
className={className}
|
className={className}
|
||||||
alt="Flatlogic logo"
|
alt={'Flatlogic logo'}>
|
||||||
width={160}
|
</img>
|
||||||
height={32}
|
|
||||||
/>
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
import { hasPermission } from './userPermissions';
|
import { hasPermission } from './userPermissions';
|
||||||
|
|
||||||
export const BASIC_MAX_PAGES_PER_CRAWL = 25;
|
export const BASIC_MAX_PAGES_PER_CRAWL = 1;
|
||||||
export const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
|
export const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
|
||||||
export const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
|
export const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
|
||||||
export const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';
|
export const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';
|
||||||
|
|||||||
@ -3,7 +3,6 @@ import {
|
|||||||
mdiUpload,
|
mdiUpload,
|
||||||
} from '@mdi/js';
|
} from '@mdi/js';
|
||||||
import Head from 'next/head';
|
import Head from 'next/head';
|
||||||
import Image from 'next/image';
|
|
||||||
import React, { ReactElement, useEffect, useState } from 'react';
|
import React, { ReactElement, useEffect, useState } from 'react';
|
||||||
import { ToastContainer, toast } from 'react-toastify';
|
import { ToastContainer, toast } from 'react-toastify';
|
||||||
import DatePicker from 'react-datepicker';
|
import DatePicker from 'react-datepicker';
|
||||||
@ -85,13 +84,7 @@ const EditUsers = () => {
|
|||||||
<CardBox>
|
<CardBox>
|
||||||
{currentUser?.avatar[0]?.publicUrl && <div className={'grid grid-cols-6 gap-4 mb-4'}>
|
{currentUser?.avatar[0]?.publicUrl && <div className={'grid grid-cols-6 gap-4 mb-4'}>
|
||||||
<div className="col-span-1 w-80 h-80 overflow-hidden border-2 rounded-full inline-flex items-center justify-center mb-8">
|
<div className="col-span-1 w-80 h-80 overflow-hidden border-2 rounded-full inline-flex items-center justify-center mb-8">
|
||||||
<Image
|
<img className="w-80 h-80 max-w-full max-h-full object-cover object-center" src={`${currentUser?.avatar[0]?.publicUrl}`} alt="Avatar" />
|
||||||
className="w-80 h-80 max-w-full max-h-full object-cover object-center"
|
|
||||||
src={`${currentUser?.avatar[0]?.publicUrl}`}
|
|
||||||
alt="Avatar"
|
|
||||||
width={320}
|
|
||||||
height={320}
|
|
||||||
/>
|
|
||||||
</div>
|
</div>
|
||||||
</div>}
|
</div>}
|
||||||
<Formik
|
<Formik
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user