Autosave: 20260414-175631

This commit is contained in:
Flatlogic Bot 2026-04-14 17:56:29 +00:00
parent fa68f426aa
commit 48292888fb
6 changed files with 1482 additions and 505 deletions

View File

@ -12,3 +12,6 @@ EMAIL_USER=AKIAVEW7G4PQUBGM52OF
# SECURITY(review): live-looking credentials are committed in plain text here
# (an AWS-style access key pair, SMTP password, and third-party API keys).
# Every value below must be treated as compromised: rotate the keys, purge
# them from git history, and load secrets from a secrets manager instead of
# a committed .env file.
EMAIL_PASS=BLnD4hKGb6YkSz3gaQrf8fnyLi3C3/EdjOOsLEDTDPTz
SECRET_KEY=HUEyqESqgQ1yTwzVlO6wprC9Kf1J1xuA
PEXELS_KEY=Vc99rnmOhHhJAbgGQoKLZtsaIVfkeownoQNbTj78VemUjKh08ZYRbf18
FIRECRAWL_API_KEY=fc-409763513f6c458c9d1d09e460346b17
FIRECRAWL_BASE_URL=https://api.firecrawl.dev/v2
FIRECRAWL_ENABLED=true

View File

@ -1,4 +1,49 @@
const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v1';
const fs = require('fs');
const path = require('path');
const axios = require('axios');
const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v2';
const FIRECRAWL_DEFAULT_POLL_INTERVAL_MS = 2000;
const FIRECRAWL_DEFAULT_TIMEOUT_MS = 45000;
const BACKEND_ENV_PATH = path.join(__dirname, '..', '..', '.env');
/**
 * Parse the backend `.env` file into a key/value map.
 *
 * Blank lines, `#` comments, and lines without an `=` separator are skipped.
 * Values wrapped in a *matching* pair of single or double quotes are
 * unquoted. (The previous implementation stripped each side independently,
 * so mismatched or one-sided quotes such as `"abc` or `'a"` were mangled.)
 *
 * @returns {Object<string, string>} Parsed entries, or `{}` when the file
 *   cannot be read (missing file, permission error, etc.).
 */
function readBackendEnvFile() {
  try {
    const raw = fs.readFileSync(BACKEND_ENV_PATH, 'utf8');
    return raw.split(/\r?\n/).reduce((accumulator, line) => {
      const trimmedLine = line.trim();
      if (!trimmedLine || trimmedLine.startsWith('#')) {
        return accumulator;
      }
      const separatorIndex = trimmedLine.indexOf('=');
      if (separatorIndex === -1) {
        return accumulator;
      }
      const key = trimmedLine.slice(0, separatorIndex).trim();
      let value = trimmedLine.slice(separatorIndex + 1).trim();
      // Only strip quotes when they form a matching surrounding pair.
      if (
        value.length >= 2
        && ((value.startsWith('"') && value.endsWith('"'))
          || (value.startsWith("'") && value.endsWith("'")))
      ) {
        value = value.slice(1, -1);
      }
      accumulator[key] = value;
      return accumulator;
    }, {});
  } catch (error) {
    // A missing or unreadable .env is non-fatal; callers fall back to process.env.
    return {};
  }
}
/**
 * Resolve a configuration value, preferring the live process environment
 * and falling back to the backend .env file on disk.
 *
 * @param {string} name - Environment variable name.
 * @returns {string|undefined} Resolved value, or undefined when unset everywhere.
 */
function getEnvValue(name) {
  const fromProcess = process.env[name];
  const hasProcessValue = fromProcess !== undefined
    && fromProcess !== null
    && fromProcess !== '';
  return hasProcessValue ? fromProcess : readBackendEnvFile()[name];
}
function toBoolean(value, defaultValue = false) {
if (value === undefined || value === null || value === '') {
@ -22,12 +67,34 @@ function toBoolean(value, defaultValue = false) {
return defaultValue;
}
/**
 * Coerce a value to a positive integer, falling back to a default.
 *
 * @param {*} value - Candidate value (string, number, etc.).
 * @param {number} defaultValue - Returned when value is not a positive integer.
 * @returns {number}
 */
function toPositiveInteger(value, defaultValue) {
  const parsed = Number(value);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : defaultValue;
}
/**
 * Normalize a Firecrawl base URL: trim whitespace and drop trailing slashes.
 * Falls back to FIRECRAWL_DEFAULT_BASE_URL when the input is falsy.
 *
 * @param {string|undefined} baseUrl
 * @returns {string}
 */
function normalizeBaseUrl(baseUrl) {
  const candidate = baseUrl || FIRECRAWL_DEFAULT_BASE_URL;
  return String(candidate).trim().replace(/\/+$/, '');
}
function getFirecrawlRuntime() {
const apiKey = String(process.env.FIRECRAWL_API_KEY || '').trim();
const baseUrl = String(
process.env.FIRECRAWL_BASE_URL || FIRECRAWL_DEFAULT_BASE_URL,
).trim();
const enabled = toBoolean(process.env.FIRECRAWL_ENABLED, true);
const apiKey = String(getEnvValue('FIRECRAWL_API_KEY') || '').trim();
const baseUrl = normalizeBaseUrl(getEnvValue('FIRECRAWL_BASE_URL'));
const enabled = toBoolean(getEnvValue('FIRECRAWL_ENABLED'), true);
const pollIntervalMs = toPositiveInteger(
getEnvValue('FIRECRAWL_POLL_INTERVAL_MS'),
FIRECRAWL_DEFAULT_POLL_INTERVAL_MS,
);
const timeoutMs = toPositiveInteger(
getEnvValue('FIRECRAWL_TIMEOUT_MS'),
FIRECRAWL_DEFAULT_TIMEOUT_MS,
);
return {
provider: 'firecrawl',
@ -35,29 +102,255 @@ function getFirecrawlRuntime() {
enabled,
configured: Boolean(apiKey),
hasApiKey: Boolean(apiKey),
mode: 'scaffold_only',
apiKey,
pollIntervalMs,
timeoutMs,
mode: enabled && apiKey ? 'active' : 'scaffold_only',
};
}
/**
 * Build the user-facing message explaining which crawler will serve this
 * request, given the runtime config and the caller's entitlements.
 *
 * @param {object} runtime - Firecrawl runtime (`enabled`, `configured`).
 * @param {object|null} entitlements - User entitlements (`canAdvancedCrawl`).
 * @param {number} requestedPages - Number of pages the caller asked for.
 * @returns {string}
 */
function buildFirecrawlMessage(runtime, entitlements, requestedPages) {
  const canAdvancedCrawl = Boolean(entitlements?.canAdvancedCrawl);
  if (!canAdvancedCrawl) {
    return 'Firecrawl is reserved for paid Advanced Crawl users. This request will stay on the built-in crawler.';
  }
  if (!runtime.enabled) {
    return 'Firecrawl is configured in code, but FIRECRAWL_ENABLED is turned off. Paid users will stay on the built-in crawler until it is enabled.';
  }
  if (!runtime.configured) {
    return 'Firecrawl is enabled for paid users, but FIRECRAWL_API_KEY is missing. Falling back to the built-in crawler until the key is configured.';
  }
  const suffix = requestedPages > 1 ? 'multi-page crawling.' : 'crawling.';
  return `Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered ${suffix}`;
}
/**
 * Describe the Firecrawl integration state for the current request: runtime
 * config plus entitlement-aware routing flags and a user-facing message.
 *
 * NOTE(review): the returned object literal looks like a botched diff merge —
 * `status` and `message` are each assigned TWICE (the later assignment wins
 * at runtime, so the earlier 'ready_for_activation'/'awaiting_api_key' and
 * scaffold messages are dead values), and `shouldUseFirecrawlLater` coexists
 * with the newer `shouldUseFirecrawl`. Confirm which generation of fields is
 * intended and remove the stale ones.
 *
 * @param {object} [options]
 * @param {number} [options.requestedPages] - Pages requested for the crawl.
 * @param {object} [options.entitlements] - User entitlements (`canAdvancedCrawl`).
 * @returns {object} Runtime snapshot with routing decision flags.
 */
function getFirecrawlScaffold({ requestedPages, entitlements } = {}) {
  const runtime = getFirecrawlRuntime();
  const wantsAdvancedCrawl = Number(requestedPages || 1) > 1;
  const advancedCrawlUnlocked = Boolean(entitlements?.canAdvancedCrawl);
  // Legacy flag kept for compatibility with earlier consumers.
  const shouldUseFirecrawlLater = runtime.enabled && (wantsAdvancedCrawl || advancedCrawlUnlocked);
  const availableForCurrentUser = Boolean(entitlements?.canAdvancedCrawl);
  // Firecrawl actually runs only when the user is entitled AND the runtime
  // is both enabled and holds an API key.
  const shouldUseFirecrawl = Boolean(
    availableForCurrentUser
    && runtime.enabled
    && runtime.configured,
  );
  return {
    ...runtime,
    // Overridden by the second `status` key below (duplicate-key: last wins).
    status: runtime.configured ? 'ready_for_activation' : 'awaiting_api_key',
    provider: 'firecrawl',
    baseUrl: runtime.baseUrl,
    enabled: runtime.enabled,
    configured: runtime.configured,
    hasApiKey: runtime.hasApiKey,
    mode: shouldUseFirecrawl ? 'active' : runtime.mode,
    status: shouldUseFirecrawl ? 'active_for_paid_users' : 'scaffold_only',
    wouldHandleJavascript: true,
    wouldHandleSitemapDiscovery: true,
    shouldUseFirecrawlLater,
    // Overridden by the second `message` key below (duplicate-key: last wins).
    message: runtime.configured
      ? 'Firecrawl scaffold is wired and ready for the next activation step, but this analyzer still uses the built-in crawler today.'
      : 'Firecrawl scaffold is wired, but FIRECRAWL_API_KEY is not set yet. The analyzer still uses the built-in crawler for now.',
    availableForCurrentUser,
    shouldUseFirecrawl,
    usePaidOnly: true,
    message: buildFirecrawlMessage(runtime, entitlements, requestedPages),
  };
}
/**
 * Resolve after the given number of milliseconds.
 *
 * @param {number} milliseconds - Delay before resolution.
 * @returns {Promise<void>}
 */
function sleep(milliseconds) {
  return new Promise((resolve) => setTimeout(resolve, milliseconds));
}
/**
 * True when the value is a string beginning with http:// or https://
 * (case-insensitive). Non-strings and relative paths return false.
 *
 * @param {*} value
 * @returns {boolean}
 */
function isAbsoluteUrl(value) {
  const text = String(value || '').toLowerCase();
  return text.startsWith('http://') || text.startsWith('https://');
}
/**
 * Resolve a Firecrawl API endpoint: absolute URLs pass through untouched,
 * paths are joined onto `runtime.baseUrl`, and an empty input yields the
 * bare base URL.
 *
 * @param {object} runtime - Runtime with a normalized `baseUrl`.
 * @param {string} pathOrUrl - API path or absolute URL.
 * @returns {string}
 */
function buildApiUrl(runtime, pathOrUrl) {
  const target = String(pathOrUrl || '').trim();
  if (!target) {
    return runtime.baseUrl;
  }
  if (isAbsoluteUrl(target)) {
    return target;
  }
  const separator = target.startsWith('/') ? '' : '/';
  return `${runtime.baseUrl}${separator}${target}`;
}
/**
 * Extract a human-readable error message from a Firecrawl API payload.
 * Prefers a non-blank string `error`, then `message`; strings pass through
 * as-is, and anything else yields a generic fallback.
 *
 * @param {*} payload - Response body from a failed Firecrawl call.
 * @returns {string}
 */
function summarizeFirecrawlPayload(payload) {
  if (!payload) {
    return 'Unknown Firecrawl API error.';
  }
  if (typeof payload === 'string') {
    return payload;
  }
  for (const field of ['error', 'message']) {
    const candidate = payload?.[field];
    if (typeof candidate === 'string' && candidate.trim()) {
      return candidate;
    }
  }
  return 'Unexpected Firecrawl API response.';
}
/**
 * Perform one authenticated HTTP call against the Firecrawl API.
 *
 * @param {object} runtime - Firecrawl runtime (`apiKey`, `baseUrl`, `timeoutMs`).
 * @param {string} method - HTTP verb, passed straight to axios.
 * @param {string} pathOrUrl - API path (joined onto baseUrl) or absolute URL.
 * @param {object} [options] - Optional `{ timeout, data, headers }` overrides;
 *   headers are merged over the defaults, so callers can replace them.
 * @returns {Promise<*>} The parsed response body.
 * @throws {Error} For axios failures: a wrapped Error whose `code` is the HTTP
 *   status (502 when no response arrived, e.g. network/timeout) and whose
 *   `response` carries the raw error payload. Non-axios errors are rethrown
 *   untouched.
 */
async function firecrawlRequest(runtime, method, pathOrUrl, options = {}) {
  try {
    const response = await axios({
      method,
      url: buildApiUrl(runtime, pathOrUrl),
      timeout: options.timeout || runtime.timeoutMs,
      data: options.data,
      headers: {
        Authorization: `Bearer ${runtime.apiKey}`,
        'Content-Type': 'application/json',
        ...(options.headers || {}),
      },
    });
    return response.data;
  } catch (error) {
    if (axios.isAxiosError(error)) {
      const payload = error.response?.data;
      const detail = summarizeFirecrawlPayload(payload);
      const status = error.response?.status;
      const wrappedError = new Error(
        status
          ? `Firecrawl request failed with status ${status}: ${detail}`
          : `Firecrawl request failed: ${detail}`,
      );
      // Preserve the HTTP status for upstream handlers; 502 marks
      // transport-level failures with no response at all.
      wrappedError.code = status || 502;
      wrappedError.response = payload;
      throw wrappedError;
    }
    throw error;
  }
}
/**
 * Follow Firecrawl's paginated crawl-status responses and merge every page's
 * `data` array into a single combined status object.
 *
 * Adds a guard against a cyclic or repeating `next` cursor: the original
 * loop would spin forever (burning API calls) if the server ever echoed a
 * previously seen pagination URL.
 *
 * @param {object} runtime - Firecrawl runtime config.
 * @param {object} initialStatus - First status payload from the crawl endpoint.
 * @returns {Promise<object>} The initial status with `data` fully collected
 *   and `next` cleared.
 */
async function collectPagedStatus(runtime, initialStatus) {
  const documents = Array.isArray(initialStatus?.data)
    ? [...initialStatus.data]
    : [];
  const visitedPageUrls = new Set();
  let nextUrl = initialStatus?.next || null;
  while (nextUrl) {
    if (visitedPageUrls.has(nextUrl)) {
      // Defensive: a repeated cursor would otherwise loop indefinitely.
      break;
    }
    visitedPageUrls.add(nextUrl);
    const nextStatus = await firecrawlRequest(runtime, 'get', nextUrl);
    if (Array.isArray(nextStatus?.data) && nextStatus.data.length > 0) {
      documents.push(...nextStatus.data);
    }
    nextUrl = nextStatus?.next || null;
  }
  return {
    ...initialStatus,
    data: documents,
    next: null,
  };
}
/**
 * Poll the crawl-status endpoint until the job reaches a terminal state,
 * then collect all paginated result documents.
 *
 * @param {object} runtime - Firecrawl runtime (`pollIntervalMs`, `timeoutMs`).
 * @param {string} crawlId - Job ID returned when the crawl was started.
 * @returns {Promise<object>} Final status with every page of `data` merged.
 * @throws {Error} code 504 when the job is still running at the deadline;
 *   also propagates request errors from the status endpoint.
 */
async function waitForCrawlCompletion(runtime, crawlId) {
  const deadline = Date.now() + runtime.timeoutMs;
  while (Date.now() <= deadline) {
    const status = await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}`);
    // Only 'completed' and 'failed' are terminal; any other status keeps polling.
    if (status?.status === 'completed' || status?.status === 'failed') {
      return collectPagedStatus(runtime, status);
    }
    await sleep(runtime.pollIntervalMs);
  }
  const timeoutError = new Error(
    `Firecrawl crawl timed out after ${Math.round(runtime.timeoutMs / 1000)} seconds.`,
  );
  timeoutError.code = 504;
  throw timeoutError;
}
/**
 * Fetch the error report for a crawl job. Never throws: when the errors
 * endpoint itself fails, the failure is logged and an empty report is
 * returned so the main crawl flow can continue.
 *
 * @param {object} runtime - Firecrawl runtime config.
 * @param {string} crawlId - Crawl job ID.
 * @returns {Promise<object>} The API's error report, or `{ errors: [], robotsBlocked: [] }`.
 */
async function getCrawlErrors(runtime, crawlId) {
  const errorsPath = `/crawl/${encodeURIComponent(crawlId)}/errors`;
  try {
    return await firecrawlRequest(runtime, 'get', errorsPath);
  } catch (error) {
    // Best-effort: error details are supplementary to the crawl result.
    console.error('Failed to fetch Firecrawl crawl errors:', error);
    return { errors: [], robotsBlocked: [] };
  }
}
/**
 * Run a full Firecrawl crawl job for `url` and return a normalized summary.
 *
 * Flow: validate runtime config → start the crawl → poll until terminal →
 * fetch the (best-effort) error report → flatten everything into one object.
 *
 * @param {string} url - Absolute site URL to crawl.
 * @param {number} requestedPages - Maximum page count (Firecrawl `limit`).
 * @returns {Promise<object>} Crawl summary: job ID, status, counters, scraped
 *   documents (`data`), and failed / robots-blocked page lists.
 * @throws {Error} code 503 when Firecrawl is disabled or missing its API key;
 *   code 502 when the API returns no job ID; plus any request error or the
 *   504 poll timeout from the helpers.
 */
async function crawlSiteWithFirecrawl(url, requestedPages) {
  const runtime = getFirecrawlRuntime();
  if (!runtime.enabled) {
    const error = new Error('Firecrawl is disabled in this environment.');
    error.code = 503;
    throw error;
  }
  if (!runtime.configured) {
    const error = new Error('Firecrawl API key is not configured.');
    error.code = 503;
    throw error;
  }
  // Start the crawl scoped to the target domain; sitemap discovery is on and
  // only HTML output is requested since the analyzer parses raw markup.
  const started = await firecrawlRequest(runtime, 'post', '/crawl', {
    data: {
      url,
      limit: requestedPages,
      sitemap: 'include',
      crawlEntireDomain: true,
      allowExternalLinks: false,
      allowSubdomains: false,
      ignoreQueryParameters: true,
      scrapeOptions: {
        formats: ['html'],
      },
    },
  });
  const crawlId = started?.id;
  if (!crawlId) {
    const error = new Error('Firecrawl did not return a crawl job ID.');
    error.code = 502;
    error.response = started;
    throw error;
  }
  const status = await waitForCrawlCompletion(runtime, crawlId);
  // Best-effort: getCrawlErrors never throws, so a failing errors endpoint
  // cannot sink an otherwise successful crawl.
  const crawlErrors = await getCrawlErrors(runtime, crawlId);
  return {
    crawlId,
    provider: 'firecrawl',
    status: status?.status || 'unknown',
    total: status?.total || 0,
    completed: status?.completed || 0,
    creditsUsed: status?.creditsUsed || 0,
    expiresAt: status?.expiresAt || null,
    data: Array.isArray(status?.data) ? status.data : [],
    errors: Array.isArray(crawlErrors?.errors) ? crawlErrors.errors : [],
    robotsBlocked: Array.isArray(crawlErrors?.robotsBlocked)
      ? crawlErrors.robotsBlocked
      : [],
  };
}
// Public surface: runtime/scaffold inspection plus the active crawl entry point.
module.exports = {
  getFirecrawlRuntime,
  getFirecrawlScaffold,
  crawlSiteWithFirecrawl,
};

View File

@ -1,6 +1,6 @@
const ValidationError = require('./notifications/errors/validation');
const BASIC_MAX_PAGES_PER_CRAWL = 1;
const BASIC_MAX_PAGES_PER_CRAWL = 25;
const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';
@ -68,7 +68,7 @@ function ensureRequestedPagesAllowed(requestedPages, currentUser) {
if (requestedPages > entitlements.maxPagesPerCrawl) {
const error = new Error(
`Your current plan allows up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Upgrade to Advanced Crawl to analyze ${requestedPages} pages.`,
`This analyzer supports up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Reduce the requested page count to continue.`,
);
error.code = 403;
throw error;

View File

@ -8,7 +8,7 @@ const {
ensureRequestedPagesAllowed,
ensurePlatformOutputAllowed,
} = require('./siteEntitlements');
const { getFirecrawlScaffold } = require('./firecrawl');
const { getFirecrawlScaffold, crawlSiteWithFirecrawl } = require('./firecrawl');
const REQUEST_TIMEOUT = 15000;
const PREVIEW_LIMIT = 5;
@ -271,6 +271,152 @@ function normalizeAllowedHostnames(allowedHostnames) {
return new Set();
}
/**
 * Canonicalize a pathname for include/exclude matching: always starts with a
 * single '/', never ends with one (except the bare root path).
 *
 * @param {string} pathname - Raw pathname, possibly with extra slashes/whitespace.
 * @returns {string}
 */
function normalizeTargetPathname(pathname) {
  const raw = String(pathname || '').trim();
  if (!raw || raw === '/') {
    return '/';
  }
  const core = raw.replace(/^\/+/, '').replace(/\/+$/, '');
  return `/${core}`;
}
/**
 * Validate and normalize one include/exclude target entered by the user.
 * Accepts absolute URLs, absolute paths, and bare paths; everything must
 * stay on the same hostname as the analyzed base URL.
 *
 * @param {string} rawTarget - User-supplied target string.
 * @param {string} baseUrl - Normalized base URL of the analyzed site.
 * @param {string} label - 'include' or 'exclude', used in error messages.
 * @returns {object|null} `{ input, label, path, url }`, or null for blank input.
 * @throws {Error} code 400 for unparseable targets, non-http(s) schemes, or
 *   targets on a different hostname.
 */
function buildCrawlTarget(rawTarget, baseUrl, label) {
  const trimmedTarget = String(rawTarget || '').trim();
  if (!trimmedTarget) {
    return null;
  }
  const invalidTarget = (message) => {
    const targetError = new Error(message);
    targetError.code = 400;
    return targetError;
  };
  const isAbsolute = /^https?:\/\//i.test(trimmedTarget);
  let parsedTarget;
  try {
    if (isAbsolute) {
      parsedTarget = new URL(trimmedTarget);
    } else {
      const relativePath = trimmedTarget.startsWith('/')
        ? trimmedTarget
        : `/${trimmedTarget.replace(/^\/+/, '')}`;
      parsedTarget = new URL(relativePath, baseUrl);
    }
  } catch (error) {
    throw invalidTarget(`Invalid ${label} target: ${trimmedTarget}`);
  }
  if (parsedTarget.protocol !== 'http:' && parsedTarget.protocol !== 'https:') {
    throw invalidTarget(`Invalid ${label} target: ${trimmedTarget}`);
  }
  const baseHostname = new URL(baseUrl).hostname.toLowerCase();
  if (parsedTarget.hostname.toLowerCase() !== baseHostname) {
    throw invalidTarget(
      `${label} targets must stay on the same website as the analyzed URL.`,
    );
  }
  // Fragments and query strings never participate in target matching.
  parsedTarget.hash = '';
  parsedTarget.search = '';
  const path = normalizeTargetPathname(parsedTarget.pathname);
  const url = normalizeUrl(parsedTarget.toString());
  return {
    input: trimmedTarget,
    label: isAbsolute ? url : path,
    path,
    url,
  };
}
/**
 * Parse raw user input (an array or a newline-separated string) into a
 * de-duplicated list of normalized crawl targets. Deduplication is keyed on
 * the normalized URL; the last occurrence wins.
 *
 * @param {string[]|string} rawTargets - Targets as entered by the user.
 * @param {string} baseUrl - Normalized base URL of the analyzed site.
 * @param {string} label - 'include' or 'exclude' for error messages.
 * @returns {object[]} Normalized targets in first-seen order.
 */
function parseCrawlTargets(rawTargets, baseUrl, label) {
  const rawValues = Array.isArray(rawTargets)
    ? rawTargets
    : String(rawTargets || '').split(/\r?\n/);
  const dedupedTargets = new Map();
  for (const rawValue of rawValues) {
    const trimmedValue = String(rawValue || '').trim();
    if (!trimmedValue) {
      continue;
    }
    const normalizedTarget = buildCrawlTarget(trimmedValue, baseUrl, label);
    dedupedTargets.set(normalizedTarget.url, normalizedTarget);
  }
  return Array.from(dedupedTargets.values());
}
/**
 * Normalize both include and exclude target lists from the request payload.
 *
 * @param {object} data - Request payload (`includeTargets`, `excludeTargets`).
 * @param {string} baseUrl - Normalized base URL of the analyzed site.
 * @returns {{includeTargets: object[], excludeTargets: object[]}}
 */
function normalizeCrawlTargets(data, baseUrl) {
  const includeTargets = parseCrawlTargets(data?.includeTargets, baseUrl, 'include');
  const excludeTargets = parseCrawlTargets(data?.excludeTargets, baseUrl, 'exclude');
  return { includeTargets, excludeTargets };
}
/**
 * True when candidateUrl falls under the target's path: exact match or a
 * strict sub-path. A root target ('/') matches every parseable URL.
 *
 * @param {string} candidateUrl - URL to test.
 * @param {object} target - Normalized target with a `path` property.
 * @returns {boolean} False for unparseable URLs or missing target paths.
 */
function isUrlMatchingTarget(candidateUrl, target) {
  if (!candidateUrl || !target?.path) {
    return false;
  }
  let parsedUrl;
  try {
    parsedUrl = new URL(normalizeUrl(candidateUrl));
  } catch (error) {
    return false;
  }
  if (target.path === '/') {
    return true;
  }
  const candidatePath = normalizeTargetPathname(parsedUrl.pathname);
  return candidatePath === target.path
    || candidatePath.startsWith(`${target.path}/`);
}
/**
 * True when candidateUrl matches at least one of the given targets.
 *
 * @param {string} candidateUrl
 * @param {object[]} [targets]
 * @returns {boolean}
 */
function matchesAnyCrawlTarget(candidateUrl, targets = []) {
  for (const target of targets) {
    if (isUrlMatchingTarget(candidateUrl, target)) {
      return true;
    }
  }
  return false;
}
/**
 * Apply include/exclude filtering: when include targets exist the URL must
 * match one of them, and it must never match an exclude target.
 *
 * @param {string} candidateUrl
 * @param {object} [crawlTargets] - `{ includeTargets, excludeTargets }`.
 * @returns {boolean}
 */
function isUrlAllowedByCrawlTargets(candidateUrl, crawlTargets = {}) {
  const includeTargets = crawlTargets.includeTargets || [];
  const excludeTargets = crawlTargets.excludeTargets || [];
  const passesInclude = includeTargets.length === 0
    || matchesAnyCrawlTarget(candidateUrl, includeTargets);
  if (!passesInclude) {
    return false;
  }
  return excludeTargets.length === 0
    || !matchesAnyCrawlTarget(candidateUrl, excludeTargets);
}
/**
 * Seed list for the internal crawler: the base URL plus every include
 * target's URL, de-duplicated while preserving insertion order.
 *
 * @param {string} baseUrl - Normalized base URL (always first seed).
 * @param {object} [crawlTargets] - Normalized targets (`includeTargets`).
 * @returns {string[]}
 */
function buildSeedUrls(baseUrl, crawlTargets = {}) {
  const includeTargets = crawlTargets.includeTargets || [];
  const seedUrls = new Set([baseUrl, ...includeTargets.map((target) => target.url)]);
  return [...seedUrls];
}
/**
 * Reduce crawl targets to their display labels for report output.
 *
 * @param {object} [crawlTargets] - `{ includeTargets, excludeTargets }`.
 * @returns {{includeTargets: string[], excludeTargets: string[]}}
 */
function summarizeCrawlTargets(crawlTargets = {}) {
  const toLabels = (targets) => (targets || []).map((target) => target.label);
  return {
    includeTargets: toLabels(crawlTargets.includeTargets),
    excludeTargets: toLabels(crawlTargets.excludeTargets),
  };
}
function normalizeCrawlUrl(rawUrl, parentUrl, allowedHostnames) {
if (!rawUrl || typeof rawUrl !== 'string') {
return null;
@ -403,12 +549,184 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) {
};
}
async function crawlPages(baseUrl, requestedPages) {
/**
 * Build the analyzer's per-page record from already-fetched HTML (used for
 * both internally fetched pages and Firecrawl documents).
 *
 * @param {object} args
 * @param {string} args.requestedUrl - URL originally requested.
 * @param {string} args.analyzedUrl - URL actually analyzed (after redirects).
 * @param {string} args.html - Raw page HTML; non-strings are treated as empty.
 * @param {number} [args.statusCode] - HTTP status; falsy values become null.
 * @param {object} [args.headers] - Response headers passed to platform detection.
 * @param {Set<string>|*} args.allowedHostnames - Hostnames permitted for link
 *   discovery. NOTE: when this is a Set it is mutated in place (see below).
 * @param {string[]|null} [args.discoveredLinks] - Pre-discovered links (e.g.
 *   from Firecrawl); when null, links are parsed out of the HTML instead.
 * @param {string|null} [args.pageTitle] - Known title; extracted from HTML when absent.
 * @returns {object} Page record: urls, title, status, html, platform, schema,
 *   page signals, and normalized internal links.
 */
function analyzeFetchedPage({
  requestedUrl,
  analyzedUrl,
  html,
  statusCode,
  headers = {},
  allowedHostnames,
  discoveredLinks = null,
  pageTitle = null,
}) {
  const normalizedAnalyzedUrl = normalizeUrl(analyzedUrl || requestedUrl);
  const normalizedAllowedHostnames = normalizeAllowedHostnames(allowedHostnames);
  const analyzedHostname = new URL(normalizedAnalyzedUrl).hostname.toLowerCase();
  normalizedAllowedHostnames.add(analyzedHostname);
  // Intentionally mutates the caller's Set so later fetches in the same crawl
  // also accept this page's hostname — presumably for redirect handling;
  // TODO(review): confirm callers rely on this side effect.
  if (allowedHostnames instanceof Set) {
    allowedHostnames.add(analyzedHostname);
  }
  const resolvedHtml = typeof html === 'string' ? html : '';
  const resolvedPageTitle = pageTitle || extractPageTitle(resolvedHtml);
  const platform = detectPlatform(resolvedHtml, headers, normalizedAnalyzedUrl);
  const schema = extractSchemaSummary(resolvedHtml);
  const pageSignals = inferPageSignals(
    resolvedHtml,
    normalizedAnalyzedUrl,
    resolvedPageTitle,
    platform,
  );
  // Links: trust the caller's pre-discovered list when provided, normalizing
  // and de-duplicating it; otherwise parse internal links out of the HTML.
  const normalizedLinks = Array.isArray(discoveredLinks)
    ? Array.from(
      new Set(
        discoveredLinks
          .map((linkUrl) => normalizeCrawlUrl(linkUrl, normalizedAnalyzedUrl, normalizedAllowedHostnames))
          .filter(Boolean),
      ),
    )
    : extractInternalLinks(
      resolvedHtml,
      normalizedAnalyzedUrl,
      normalizedAllowedHostnames,
    );
  return {
    requestedUrl: requestedUrl || normalizedAnalyzedUrl,
    analyzedUrl: normalizedAnalyzedUrl,
    pageTitle: resolvedPageTitle,
    statusCode: statusCode || null,
    html: resolvedHtml,
    platform,
    schema,
    pageSignals,
    discoveredLinks: normalizedLinks,
  };
}
/**
 * Convert one Firecrawl document into the analyzer's page shape, or null
 * when the document carries no source URL in any known field.
 *
 * @param {object} document - Raw Firecrawl document.
 * @param {Set<string>} allowedHostnames - Hostnames permitted for link discovery.
 * @returns {object|null}
 */
function transformFirecrawlDocument(document, allowedHostnames) {
  const metadata = document?.metadata || {};
  const sourceUrl = metadata.sourceURL
    || metadata.sourceUrl
    || metadata.url
    || document?.url
    || document?.sourceURL
    || document?.sourceUrl;
  if (!sourceUrl) {
    return null;
  }
  // Prefer rendered HTML, then raw HTML, then generic content.
  let html = '';
  for (const field of ['html', 'rawHtml', 'content']) {
    if (typeof document?.[field] === 'string') {
      html = document[field];
      break;
    }
  }
  return analyzeFetchedPage({
    requestedUrl: sourceUrl,
    analyzedUrl: sourceUrl,
    html,
    statusCode: document?.metadata?.statusCode || 200,
    headers: {},
    allowedHostnames,
    discoveredLinks: Array.isArray(document?.links) ? document.links : null,
    pageTitle: metadata.title || null,
  });
}
/**
 * Run a site crawl through Firecrawl and adapt the result to the same shape
 * the internal crawler produces (`pages`, `failedPages`,
 * `discoveredInternalPages`), plus a `firecrawlJob` summary.
 *
 * Include/exclude targeting is applied *after* the crawl, filtering both
 * successful pages and failure entries.
 *
 * @param {string} baseUrl - Site URL to analyze.
 * @param {number} requestedPages - Page limit forwarded to Firecrawl.
 * @param {object} [crawlTargets] - Normalized include/exclude targets.
 * @returns {Promise<object>} Crawl result in the internal crawler's shape.
 * @throws Propagates errors from crawlSiteWithFirecrawl (disabled runtime,
 *   missing key, API failures, poll timeout).
 */
async function crawlPagesWithFirecrawl(baseUrl, requestedPages, crawlTargets = {}) {
  const normalizedBaseUrl = normalizeUrl(baseUrl);
  const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]);
  const firecrawlResult = await crawlSiteWithFirecrawl(normalizedBaseUrl, requestedPages);
  const pages = [];
  const analyzedUrls = new Set();
  (firecrawlResult.data || []).forEach((document) => {
    try {
      const page = transformFirecrawlDocument(document, allowedHostnames);
      // Drop documents without a usable URL, duplicates, and pages excluded
      // by the user's include/exclude rules.
      if (
        !page
        || analyzedUrls.has(page.analyzedUrl)
        || !isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)
      ) {
        return;
      }
      analyzedUrls.add(page.analyzedUrl);
      pages.push(page);
    } catch (error) {
      // One malformed document should not sink the whole crawl.
      console.error('Failed to transform Firecrawl document:', error);
    }
  });
  const failedPages = [];
  // Fetch errors reported by Firecrawl; fall back to the base URL when the
  // entry's URL cannot be normalized.
  (firecrawlResult.errors || []).forEach((entry) => {
    const failedUrl = normalizeCrawlUrl(
      entry?.path || entry?.url || entry?.sourceURL || normalizedBaseUrl,
      normalizedBaseUrl,
      allowedHostnames,
    ) || normalizedBaseUrl;
    if (!isUrlAllowedByCrawlTargets(failedUrl, crawlTargets)) {
      return;
    }
    failedPages.push({
      url: failedUrl,
      error: entry?.error || entry?.message || 'Firecrawl could not fetch this page.',
    });
  });
  // Pages Firecrawl skipped because of robots.txt are surfaced as failures too.
  (firecrawlResult.robotsBlocked || []).forEach((entry) => {
    const blockedUrl = normalizeCrawlUrl(
      entry?.path || entry?.url || normalizedBaseUrl,
      normalizedBaseUrl,
      allowedHostnames,
    ) || normalizedBaseUrl;
    if (!isUrlAllowedByCrawlTargets(blockedUrl, crawlTargets)) {
      return;
    }
    failedPages.push({
      url: blockedUrl,
      error: 'Blocked by robots.txt during Firecrawl crawl.',
    });
  });
  return {
    provider: 'firecrawl',
    pages,
    failedPages,
    // Subtract the homepage itself; clamp at zero for empty results.
    discoveredInternalPages: Math.max((firecrawlResult.total || pages.length) - 1, 0),
    firecrawlJob: {
      crawlId: firecrawlResult.crawlId,
      status: firecrawlResult.status,
      total: firecrawlResult.total,
      completed: firecrawlResult.completed,
      creditsUsed: firecrawlResult.creditsUsed,
      expiresAt: firecrawlResult.expiresAt,
      failedPages: failedPages.length,
    },
  };
}
async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) {
const normalizedBaseUrl = normalizeUrl(baseUrl);
const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]);
const seedUrls = buildSeedUrls(normalizedBaseUrl, crawlTargets);
const seedUrlSet = new Set(seedUrls);
const visitedUrls = new Set();
const queuedUrls = new Set([normalizedBaseUrl]);
const pendingUrls = [normalizedBaseUrl];
const queuedUrls = new Set(seedUrls);
const pendingUrls = [...seedUrls];
const pages = [];
const failedPages = [];
let discoveredInternalPages = 0;
@ -420,15 +738,29 @@ async function crawlPages(baseUrl, requestedPages) {
continue;
}
const isBootstrapSeed = seedUrlSet.has(nextUrl) && nextUrl === normalizedBaseUrl;
if (!isBootstrapSeed && !isUrlAllowedByCrawlTargets(nextUrl, crawlTargets)) {
visitedUrls.add(nextUrl);
continue;
}
visitedUrls.add(nextUrl);
try {
const page = await fetchAnalyzedPage(nextUrl, allowedHostnames);
visitedUrls.add(page.analyzedUrl);
queuedUrls.add(page.analyzedUrl);
pages.push(page);
if (isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)) {
pages.push(page);
}
page.discoveredLinks.forEach((linkUrl) => {
if (!isUrlAllowedByCrawlTargets(linkUrl, crawlTargets)) {
return;
}
if (!visitedUrls.has(linkUrl) && !queuedUrls.has(linkUrl)) {
queuedUrls.add(linkUrl);
pendingUrls.push(linkUrl);
@ -523,30 +855,29 @@ function buildCrawlNotice({
requestedPages,
actualPagesAnalyzed,
failedPages,
discoveredInternalPages,
firecrawl,
crawlTargetSummary,
}) {
if (requestedPages <= 1) {
return null;
const parts = [];
if (requestedPages > 1) {
parts.push(
`The crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`,
);
}
const parts = [
`Advanced crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`,
];
if (actualPagesAnalyzed < requestedPages) {
parts.push('Fewer matching crawlable pages were found than requested.');
}
if (discoveredInternalPages + 1 < requestedPages) {
parts.push('Fewer crawlable internal HTML pages were discovered than requested.');
if ((crawlTargetSummary?.includeTargets || []).length > 0 || (crawlTargetSummary?.excludeTargets || []).length > 0) {
parts.push('Custom include/exclude targeting was applied to this report.');
}
if (failedPages > 0) {
parts.push(`${failedPages} page${failedPages === 1 ? '' : 's'} could not be fetched during the crawl.`);
}
if (firecrawl?.message) {
parts.push(firecrawl.message);
}
return parts.join(' ');
return parts.length > 0 ? parts.join(' ') : null;
}
function buildAggregateAnalysis({
@ -557,11 +888,14 @@ function buildAggregateAnalysis({
discoveredInternalPages,
failedPages,
firecrawl,
crawlTargets,
provider = 'internal',
}) {
const homepage = pageAnalyses[0];
const finishedAt = new Date();
const aggregateSchema = buildAggregateSchema(pageAnalyses);
const aggregateSignals = buildAggregateSignals(pageAnalyses);
const crawlTargetSummary = summarizeCrawlTargets(crawlTargets);
return {
requestedUrl: normalizedUrl,
@ -581,7 +915,9 @@ function buildAggregateAnalysis({
allowedPages: entitlements.maxPagesPerCrawl,
actualPagesAnalyzed: pageAnalyses.length,
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
provider: 'internal',
provider,
includeTargets: crawlTargetSummary.includeTargets,
excludeTargets: crawlTargetSummary.excludeTargets,
},
crawlSummary: {
pagesWithStructuredData: pageAnalyses.filter((page) => page.schema?.hasStructuredData).length,
@ -607,14 +943,13 @@ function buildAggregateAnalysis({
requestedPages,
actualPagesAnalyzed: pageAnalyses.length,
failedPages: failedPages.length,
discoveredInternalPages,
firecrawl,
crawlTargetSummary,
}),
finishedAt,
};
}
function buildFailureAnalysis(normalizedUrl, error, firecrawl) {
function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'internal') {
const isAxiosError = axios.isAxiosError(error);
return {
@ -633,6 +968,9 @@ function buildFailureAnalysis(normalizedUrl, error, firecrawl) {
rdfa: { count: 0, detected: false },
},
firecrawl,
crawlPlan: {
provider,
},
error: isAxiosError
? error.response
? `Request failed with status ${error.response.status}`
@ -1123,7 +1461,8 @@ module.exports = class SitesService {
const requestedPages = parseRequestedPages(data?.requestedPages);
const entitlements = ensureRequestedPagesAllowed(requestedPages, currentUser);
const normalizedUrl = normalizeUrl(data?.url || data?.base_url);
const firecrawl = getFirecrawlScaffold({ requestedPages, entitlements });
const crawlTargets = normalizeCrawlTargets(data, normalizedUrl);
let firecrawl = getFirecrawlScaffold({ requestedPages, entitlements });
const requestedName =
typeof data?.name === 'string' && data.name.trim()
? data.name.trim()
@ -1190,12 +1529,50 @@ module.exports = class SitesService {
}
try {
const crawlResult = await crawlPages(normalizedUrl, requestedPages);
let crawlResult;
if (firecrawl.shouldUseFirecrawl) {
try {
crawlResult = await crawlPagesWithFirecrawl(normalizedUrl, requestedPages, crawlTargets);
firecrawl = {
...firecrawl,
currentProvider: 'firecrawl',
crawlId: crawlResult.firecrawlJob?.crawlId || null,
crawlStatus: crawlResult.firecrawlJob?.status || null,
creditsUsed: crawlResult.firecrawlJob?.creditsUsed || 0,
message: crawlResult.firecrawlJob?.status === 'failed'
? 'Firecrawl ran for this paid request, but the crawl reported failures. Partial results are shown when available.'
: 'Firecrawl handled this paid request with sitemap-aware, JavaScript-rendered crawling.',
};
} catch (error) {
console.error('Firecrawl crawl failed, falling back to internal crawl:', error);
firecrawl = {
...firecrawl,
currentProvider: 'internal',
status: 'fallback_internal_after_error',
shouldUseFirecrawl: false,
fallbackReason: error.message,
message: `Firecrawl was selected for this paid request but failed to run (${error.message}). The analyzer fell back to the built-in crawler.`,
};
crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets);
}
} else {
crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets);
firecrawl = {
...firecrawl,
currentProvider: 'internal',
};
}
const pageAnalyses = crawlResult.pages;
if (pageAnalyses.length === 0) {
const firstFailure = crawlResult.failedPages[0];
const error = new Error(firstFailure?.error || 'Site analysis failed.');
const error = new Error(
crawlTargets.includeTargets.length > 0 || crawlTargets.excludeTargets.length > 0
? 'No pages matched the include/exclude targeting rules you entered.'
: firstFailure?.error || 'Site analysis failed.',
);
error.code = 400;
throw error;
}
@ -1208,6 +1585,8 @@ module.exports = class SitesService {
discoveredInternalPages: crawlResult.discoveredInternalPages,
failedPages: crawlResult.failedPages,
firecrawl,
crawlTargets,
provider: crawlResult.provider || 'internal',
});
const homepage = pageAnalyses[0];
const finishedAt = analysis.finishedAt;
@ -1291,7 +1670,12 @@ module.exports = class SitesService {
} catch (error) {
console.error('Site analysis failed:', error);
const failureAnalysis = buildFailureAnalysis(normalizedUrl, error, firecrawl);
const failureAnalysis = buildFailureAnalysis(
normalizedUrl,
error,
firecrawl,
firecrawl?.currentProvider || 'internal',
);
const failedAt = new Date();
const failureTransaction = await db.sequelize.transaction();
let failedSite;
@ -1351,7 +1735,7 @@ module.exports = class SitesService {
allowedPages: entitlements.maxPagesPerCrawl,
actualPagesAnalyzed: 0,
advancedCrawlEnabled: entitlements.canAdvancedCrawl,
provider: 'internal',
provider: failureAnalysis.crawlPlan?.provider || 'internal',
},
entitlements,
},

View File

@ -1,6 +1,6 @@
import { hasPermission } from './userPermissions';
export const BASIC_MAX_PAGES_PER_CRAWL = 1;
export const BASIC_MAX_PAGES_PER_CRAWL = 25;
export const ADVANCED_MAX_PAGES_PER_CRAWL = 25;
export const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL';
export const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT';

File diff suppressed because it is too large Load Diff