diff --git a/backend/.env b/backend/.env index ae9b07a..c8a42c3 100644 --- a/backend/.env +++ b/backend/.env @@ -12,3 +12,6 @@ EMAIL_USER=AKIAVEW7G4PQUBGM52OF EMAIL_PASS=BLnD4hKGb6YkSz3gaQrf8fnyLi3C3/EdjOOsLEDTDPTz SECRET_KEY=HUEyqESqgQ1yTwzVlO6wprC9Kf1J1xuA PEXELS_KEY=Vc99rnmOhHhJAbgGQoKLZtsaIVfkeownoQNbTj78VemUjKh08ZYRbf18 +FIRECRAWL_API_KEY=fc-409763513f6c458c9d1d09e460346b17 +FIRECRAWL_BASE_URL=https://api.firecrawl.dev/v2 +FIRECRAWL_ENABLED=true diff --git a/backend/src/services/firecrawl.js b/backend/src/services/firecrawl.js index 8a5fe02..75e712c 100644 --- a/backend/src/services/firecrawl.js +++ b/backend/src/services/firecrawl.js @@ -1,4 +1,49 @@ -const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v1'; +const fs = require('fs'); +const path = require('path'); +const axios = require('axios'); + +const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v2'; +const FIRECRAWL_DEFAULT_POLL_INTERVAL_MS = 2000; +const FIRECRAWL_DEFAULT_TIMEOUT_MS = 45000; + +const BACKEND_ENV_PATH = path.join(__dirname, '..', '..', '.env'); + +function readBackendEnvFile() { + try { + const raw = fs.readFileSync(BACKEND_ENV_PATH, 'utf8'); + + return raw.split(/\r?\n/).reduce((accumulator, line) => { + const trimmedLine = line.trim(); + + if (!trimmedLine || trimmedLine.startsWith('#')) { + return accumulator; + } + + const separatorIndex = trimmedLine.indexOf('='); + + if (separatorIndex === -1) { + return accumulator; + } + + const key = trimmedLine.slice(0, separatorIndex).trim(); + const value = trimmedLine.slice(separatorIndex + 1).trim(); + + accumulator[key] = value.replace(/^"|"$/g, '').replace(/^'|'$/g, ''); + return accumulator; + }, {}); + } catch (error) { + return {}; + } +} + +function getEnvValue(name) { + if (process.env[name] !== undefined && process.env[name] !== null && process.env[name] !== '') { + return process.env[name]; + } + + return readBackendEnvFile()[name]; +} + function toBoolean(value, defaultValue = false) { if (value === undefined || value === null || value === '') { @@ -22,12 +67,34 @@ function toBoolean(value, defaultValue = false) { return defaultValue; } +function toPositiveInteger(value, defaultValue) { + const parsed = Number(value); + + if (Number.isInteger(parsed) && parsed > 0) { + return parsed; + } + + return defaultValue; +} + +function normalizeBaseUrl(baseUrl) { + return String(baseUrl || FIRECRAWL_DEFAULT_BASE_URL) + .trim() + .replace(/\/+$/, ''); +} + function getFirecrawlRuntime() { - const apiKey = String(process.env.FIRECRAWL_API_KEY || '').trim(); - const baseUrl = String( - process.env.FIRECRAWL_BASE_URL || FIRECRAWL_DEFAULT_BASE_URL, - ).trim(); - const enabled = toBoolean(process.env.FIRECRAWL_ENABLED, true); + const apiKey = String(getEnvValue('FIRECRAWL_API_KEY') || '').trim(); + const baseUrl = normalizeBaseUrl(getEnvValue('FIRECRAWL_BASE_URL')); + const enabled = toBoolean(getEnvValue('FIRECRAWL_ENABLED'), true); + const pollIntervalMs = toPositiveInteger( + getEnvValue('FIRECRAWL_POLL_INTERVAL_MS'), + FIRECRAWL_DEFAULT_POLL_INTERVAL_MS, + ); + const timeoutMs = toPositiveInteger( + getEnvValue('FIRECRAWL_TIMEOUT_MS'), + FIRECRAWL_DEFAULT_TIMEOUT_MS, + ); return { provider: 'firecrawl', @@ -35,29 +102,255 @@ function getFirecrawlRuntime() { enabled, configured: Boolean(apiKey), hasApiKey: Boolean(apiKey), - mode: 'scaffold_only', + apiKey, + pollIntervalMs, + timeoutMs, + mode: enabled && apiKey ? 'active' : 'scaffold_only', }; } +function buildFirecrawlMessage(runtime, entitlements, requestedPages) { + if (!entitlements?.canAdvancedCrawl) { + return 'Firecrawl is reserved for paid Advanced Crawl users. This request will stay on the built-in crawler.'; + } + + if (!runtime.enabled) { + return 'Firecrawl is configured in code, but FIRECRAWL_ENABLED is turned off. Paid users will stay on the built-in crawler until it is enabled.'; + } + + if (!runtime.configured) { + return 'Firecrawl is enabled for paid users, but FIRECRAWL_API_KEY is missing. Falling back to the built-in crawler until the key is configured.'; + } + + return requestedPages > 1 + ? 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered multi-page crawling.' + : 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered crawling.'; +} + function getFirecrawlScaffold({ requestedPages, entitlements } = {}) { const runtime = getFirecrawlRuntime(); - const wantsAdvancedCrawl = Number(requestedPages || 1) > 1; - const advancedCrawlUnlocked = Boolean(entitlements?.canAdvancedCrawl); - const shouldUseFirecrawlLater = runtime.enabled && (wantsAdvancedCrawl || advancedCrawlUnlocked); + const availableForCurrentUser = Boolean(entitlements?.canAdvancedCrawl); + const shouldUseFirecrawl = Boolean( + availableForCurrentUser + && runtime.enabled + && runtime.configured, + ); return { - ...runtime, - status: runtime.configured ? 'ready_for_activation' : 'awaiting_api_key', + provider: 'firecrawl', + baseUrl: runtime.baseUrl, + enabled: runtime.enabled, + configured: runtime.configured, + hasApiKey: runtime.hasApiKey, + mode: shouldUseFirecrawl ? 'active' : runtime.mode, + status: shouldUseFirecrawl ? 'active_for_paid_users' : 'scaffold_only', wouldHandleJavascript: true, wouldHandleSitemapDiscovery: true, - shouldUseFirecrawlLater, - message: runtime.configured - ? 'Firecrawl scaffold is wired and ready for the next activation step, but this analyzer still uses the built-in crawler today.' - : 'Firecrawl scaffold is wired, but FIRECRAWL_API_KEY is not set yet. The analyzer still uses the built-in crawler for now.', + availableForCurrentUser, + shouldUseFirecrawl, + usePaidOnly: true, + message: buildFirecrawlMessage(runtime, entitlements, requestedPages), + }; +} + +function sleep(milliseconds) { + return new Promise((resolve) => { + setTimeout(resolve, milliseconds); + }); +} + +function isAbsoluteUrl(value) { + return /^https?:\/\//i.test(String(value || '')); +} + +function buildApiUrl(runtime, pathOrUrl) { + const value = String(pathOrUrl || '').trim(); + + if (!value) { + return runtime.baseUrl; + } + + if (isAbsoluteUrl(value)) { + return value; + } + + if (value.startsWith('/')) { + return `${runtime.baseUrl}${value}`; + } + + return `${runtime.baseUrl}/${value}`; +} + +function summarizeFirecrawlPayload(payload) { + if (!payload) { + return 'Unknown Firecrawl API error.'; + } + + if (typeof payload === 'string') { + return payload; + } + + if (typeof payload?.error === 'string' && payload.error.trim()) { + return payload.error; + } + + if (typeof payload?.message === 'string' && payload.message.trim()) { + return payload.message; + } + + return 'Unexpected Firecrawl API response.'; +} + +async function firecrawlRequest(runtime, method, pathOrUrl, options = {}) { + try { + const response = await axios({ + method, + url: buildApiUrl(runtime, pathOrUrl), + timeout: options.timeout || runtime.timeoutMs, + data: options.data, + headers: { + Authorization: `Bearer ${runtime.apiKey}`, + 'Content-Type': 'application/json', + ...(options.headers || {}), + }, + }); + + return response.data; + } catch (error) { + if (axios.isAxiosError(error)) { + const payload = error.response?.data; + const detail = summarizeFirecrawlPayload(payload); + const status = error.response?.status; + const wrappedError = new Error( + status + ? `Firecrawl request failed with status ${status}: ${detail}` + : `Firecrawl request failed: ${detail}`, + ); + + wrappedError.code = status || 502; + wrappedError.response = payload; + throw wrappedError; + } + + throw error; + } +} + +async function collectPagedStatus(runtime, initialStatus) { + const documents = Array.isArray(initialStatus?.data) + ? [...initialStatus.data] + : []; + let nextUrl = initialStatus?.next || null; + + while (nextUrl) { + const nextStatus = await firecrawlRequest(runtime, 'get', nextUrl); + + if (Array.isArray(nextStatus?.data) && nextStatus.data.length > 0) { + documents.push(...nextStatus.data); + } + + nextUrl = nextStatus?.next || null; + } + + return { + ...initialStatus, + data: documents, + next: null, + }; +} + +async function waitForCrawlCompletion(runtime, crawlId) { + const deadline = Date.now() + runtime.timeoutMs; + + while (Date.now() <= deadline) { + const status = await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}`); + + if (status?.status === 'completed' || status?.status === 'failed') { + return collectPagedStatus(runtime, status); + } + + await sleep(runtime.pollIntervalMs); + } + + const timeoutError = new Error( + `Firecrawl crawl timed out after ${Math.round(runtime.timeoutMs / 1000)} seconds.`, + ); + timeoutError.code = 504; + throw timeoutError; +} + +async function getCrawlErrors(runtime, crawlId) { + try { + return await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}/errors`); + } catch (error) { + console.error('Failed to fetch Firecrawl crawl errors:', error); + return { + errors: [], + robotsBlocked: [], + }; + } +} + +async function crawlSiteWithFirecrawl(url, requestedPages) { + const runtime = getFirecrawlRuntime(); + + if (!runtime.enabled) { + const error = new Error('Firecrawl is disabled in this environment.'); + error.code = 503; + throw error; + } + + if (!runtime.configured) { + const error = new Error('Firecrawl API key is not configured.'); + error.code = 503; + throw error; + } + + const started = await firecrawlRequest(runtime, 'post', '/crawl', { + data: { + url, + limit: requestedPages, + sitemap: 'include', + crawlEntireDomain: true, + allowExternalLinks: false, + allowSubdomains: false, + ignoreQueryParameters: true, + scrapeOptions: { + formats: ['html'], + }, + }, + }); + + const crawlId = started?.id; + + if (!crawlId) { + const error = new Error('Firecrawl did not return a crawl job ID.'); + error.code = 502; + error.response = started; + throw error; + } + + const status = await waitForCrawlCompletion(runtime, crawlId); + const crawlErrors = await getCrawlErrors(runtime, crawlId); + + return { + crawlId, + provider: 'firecrawl', + status: status?.status || 'unknown', + total: status?.total || 0, + completed: status?.completed || 0, + creditsUsed: status?.creditsUsed || 0, + expiresAt: status?.expiresAt || null, + data: Array.isArray(status?.data) ? status.data : [], + errors: Array.isArray(crawlErrors?.errors) ? crawlErrors.errors : [], + robotsBlocked: Array.isArray(crawlErrors?.robotsBlocked) + ? crawlErrors.robotsBlocked + : [], }; } module.exports = { getFirecrawlRuntime, getFirecrawlScaffold, + crawlSiteWithFirecrawl, }; diff --git a/backend/src/services/siteEntitlements.js b/backend/src/services/siteEntitlements.js index ce3ec26..47e7c53 100644 --- a/backend/src/services/siteEntitlements.js +++ b/backend/src/services/siteEntitlements.js @@ -1,6 +1,6 @@ const ValidationError = require('./notifications/errors/validation'); -const BASIC_MAX_PAGES_PER_CRAWL = 1; +const BASIC_MAX_PAGES_PER_CRAWL = 25; const ADVANCED_MAX_PAGES_PER_CRAWL = 25; const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL'; const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT'; @@ -68,7 +68,7 @@ function ensureRequestedPagesAllowed(requestedPages, currentUser) { if (requestedPages > entitlements.maxPagesPerCrawl) { const error = new Error( - `Your current plan allows up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Upgrade to Advanced Crawl to analyze ${requestedPages} pages.`, + `This analyzer supports up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Reduce the requested page count to continue.`, ); error.code = 403; throw error; diff --git a/backend/src/services/sites.js b/backend/src/services/sites.js index 9ee0055..ac36a79 100644 --- a/backend/src/services/sites.js +++ b/backend/src/services/sites.js @@ -8,7 +8,7 @@ const { ensureRequestedPagesAllowed, ensurePlatformOutputAllowed, } = require('./siteEntitlements'); -const { getFirecrawlScaffold } = require('./firecrawl'); +const { getFirecrawlScaffold, crawlSiteWithFirecrawl } = require('./firecrawl'); const REQUEST_TIMEOUT = 15000; const PREVIEW_LIMIT = 5; @@ -271,6 +271,152 @@ function normalizeAllowedHostnames(allowedHostnames) { return new Set(); } +function normalizeTargetPathname(pathname) { + const trimmedPathname = String(pathname || '').trim(); + + if (!trimmedPathname || trimmedPathname === '/') { + return '/'; + } + + return `/${trimmedPathname.replace(/^\/+/, '').replace(/\/+$/, '')}`; +} + +function buildCrawlTarget(rawTarget, baseUrl, label) { + const trimmedTarget = String(rawTarget || '').trim(); + + if (!trimmedTarget) { + return null; + } + + let parsedTarget; + + try { + if (/^https?:\/\//i.test(trimmedTarget)) { + parsedTarget = new URL(trimmedTarget); + } else if (trimmedTarget.startsWith('/')) { + parsedTarget = new URL(trimmedTarget, baseUrl); + } else { + parsedTarget = new URL(`/${trimmedTarget.replace(/^\/+/, '')}`, baseUrl); + } + } catch (error) { + const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`); + targetError.code = 400; + throw targetError; + } + + if (!['http:', 'https:'].includes(parsedTarget.protocol)) { + const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`); + targetError.code = 400; + throw targetError; + } + + const baseHostname = new URL(baseUrl).hostname.toLowerCase(); + + if (parsedTarget.hostname.toLowerCase() !== baseHostname) { + const targetError = new Error( + `${label} targets must stay on the same website as the analyzed URL.`, + ); + targetError.code = 400; + throw targetError; + } + + parsedTarget.hash = ''; + parsedTarget.search = ''; + + const path = normalizeTargetPathname(parsedTarget.pathname); + const url = normalizeUrl(parsedTarget.toString()); + + return { + input: trimmedTarget, + label: /^https?:\/\//i.test(trimmedTarget) ? url : path, + path, + url, + }; +} + +function parseCrawlTargets(rawTargets, baseUrl, label) { + const targetValues = Array.isArray(rawTargets) + ? rawTargets + : String(rawTargets || '').split(/\r?\n/); + const dedupedTargets = new Map(); + + targetValues + .map((targetValue) => String(targetValue || '').trim()) + .filter(Boolean) + .forEach((targetValue) => { + const normalizedTarget = buildCrawlTarget(targetValue, baseUrl, label); + + dedupedTargets.set(normalizedTarget.url, normalizedTarget); + }); + + return Array.from(dedupedTargets.values()); +} + +function normalizeCrawlTargets(data, baseUrl) { + return { + includeTargets: parseCrawlTargets(data?.includeTargets, baseUrl, 'include'), + excludeTargets: parseCrawlTargets(data?.excludeTargets, baseUrl, 'exclude'), + }; +} + +function isUrlMatchingTarget(candidateUrl, target) { + if (!candidateUrl || !target?.path) { + return false; + } + + let parsedUrl; + + try { + parsedUrl = new URL(normalizeUrl(candidateUrl)); + } catch (error) { + return false; + } + + const candidatePath = normalizeTargetPathname(parsedUrl.pathname); + + if (target.path === '/') { + return true; + } + + return candidatePath === target.path || candidatePath.startsWith(`${target.path}/`); +} + +function matchesAnyCrawlTarget(candidateUrl, targets = []) { + return targets.some((target) => isUrlMatchingTarget(candidateUrl, target)); +} + +function isUrlAllowedByCrawlTargets(candidateUrl, crawlTargets = {}) { + const includeTargets = crawlTargets.includeTargets || []; + const excludeTargets = crawlTargets.excludeTargets || []; + + if (includeTargets.length > 0 && !matchesAnyCrawlTarget(candidateUrl, includeTargets)) { + return false; + } + + if (excludeTargets.length > 0 && matchesAnyCrawlTarget(candidateUrl, excludeTargets)) { + return false; + } + + return true; +} + +function buildSeedUrls(baseUrl, crawlTargets = {}) { + const seedUrls = new Set([baseUrl]); + + (crawlTargets.includeTargets || []).forEach((target) => { + seedUrls.add(target.url); + }); + + return Array.from(seedUrls); +} + +function summarizeCrawlTargets(crawlTargets = {}) { + return { + includeTargets: (crawlTargets.includeTargets || []).map((target) => target.label), + excludeTargets: (crawlTargets.excludeTargets || []).map((target) => target.label), + }; +} + function normalizeCrawlUrl(rawUrl, parentUrl, allowedHostnames) { if (!rawUrl || typeof rawUrl !== 'string') { return null; @@ -403,12 +549,184 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) { }; } -async function crawlPages(baseUrl, requestedPages) { +function analyzeFetchedPage({ + requestedUrl, + analyzedUrl, + html, + statusCode, + headers = {}, + allowedHostnames, + discoveredLinks = null, + pageTitle = null, +}) { + const normalizedAnalyzedUrl = normalizeUrl(analyzedUrl || requestedUrl); + const normalizedAllowedHostnames = normalizeAllowedHostnames(allowedHostnames); + const analyzedHostname = new URL(normalizedAnalyzedUrl).hostname.toLowerCase(); + normalizedAllowedHostnames.add(analyzedHostname); + + if (allowedHostnames instanceof Set) { + allowedHostnames.add(analyzedHostname); + } + + const resolvedHtml = typeof html === 'string' ? html : ''; + const resolvedPageTitle = pageTitle || extractPageTitle(resolvedHtml); + const platform = detectPlatform(resolvedHtml, headers, normalizedAnalyzedUrl); + const schema = extractSchemaSummary(resolvedHtml); + const pageSignals = inferPageSignals( + resolvedHtml, + normalizedAnalyzedUrl, + resolvedPageTitle, + platform, + ); + const normalizedLinks = Array.isArray(discoveredLinks) + ? Array.from( + new Set( + discoveredLinks + .map((linkUrl) => normalizeCrawlUrl(linkUrl, normalizedAnalyzedUrl, normalizedAllowedHostnames)) + .filter(Boolean), + ), + ) + : extractInternalLinks( + resolvedHtml, + normalizedAnalyzedUrl, + normalizedAllowedHostnames, + ); + + return { + requestedUrl: requestedUrl || normalizedAnalyzedUrl, + analyzedUrl: normalizedAnalyzedUrl, + pageTitle: resolvedPageTitle, + statusCode: statusCode || null, + html: resolvedHtml, + platform, + schema, + pageSignals, + discoveredLinks: normalizedLinks, + }; +} + +function transformFirecrawlDocument(document, allowedHostnames) { + const metadata = document?.metadata || {}; + const sourceUrl = + metadata.sourceURL + || metadata.sourceUrl + || metadata.url + || document?.url + || document?.sourceURL + || document?.sourceUrl; + + if (!sourceUrl) { + return null; + } + + const html = + typeof document?.html === 'string' + ? document.html + : typeof document?.rawHtml === 'string' + ? document.rawHtml + : typeof document?.content === 'string' + ? document.content + : ''; + + return analyzeFetchedPage({ + requestedUrl: sourceUrl, + analyzedUrl: sourceUrl, + html, + statusCode: document?.metadata?.statusCode || 200, + headers: {}, + allowedHostnames, + discoveredLinks: Array.isArray(document?.links) ? document.links : null, + pageTitle: metadata.title || null, + }); +} + +async function crawlPagesWithFirecrawl(baseUrl, requestedPages, crawlTargets = {}) { const normalizedBaseUrl = normalizeUrl(baseUrl); const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]); + const firecrawlResult = await crawlSiteWithFirecrawl(normalizedBaseUrl, requestedPages); + const pages = []; + const analyzedUrls = new Set(); + + (firecrawlResult.data || []).forEach((document) => { + try { + const page = transformFirecrawlDocument(document, allowedHostnames); + + if ( + !page + || analyzedUrls.has(page.analyzedUrl) + || !isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets) + ) { + return; + } + + analyzedUrls.add(page.analyzedUrl); + pages.push(page); + } catch (error) { + console.error('Failed to transform Firecrawl document:', error); + } + }); + + const failedPages = []; + + (firecrawlResult.errors || []).forEach((entry) => { + const failedUrl = normalizeCrawlUrl( + entry?.path || entry?.url || entry?.sourceURL || normalizedBaseUrl, + normalizedBaseUrl, + allowedHostnames, + ) || normalizedBaseUrl; + + if (!isUrlAllowedByCrawlTargets(failedUrl, crawlTargets)) { + return; + } + + failedPages.push({ + url: failedUrl, + error: entry?.error || entry?.message || 'Firecrawl could not fetch this page.', + }); + }); + + (firecrawlResult.robotsBlocked || []).forEach((entry) => { + const blockedUrl = normalizeCrawlUrl( + entry?.path || entry?.url || normalizedBaseUrl, + normalizedBaseUrl, + allowedHostnames, + ) || normalizedBaseUrl; + + if (!isUrlAllowedByCrawlTargets(blockedUrl, crawlTargets)) { + return; + } + + failedPages.push({ + url: blockedUrl, + error: 'Blocked by robots.txt during Firecrawl crawl.', + }); + }); + + return { + provider: 'firecrawl', + pages, + failedPages, + discoveredInternalPages: Math.max((firecrawlResult.total || pages.length) - 1, 0), + firecrawlJob: { + crawlId: firecrawlResult.crawlId, + status: firecrawlResult.status, + total: firecrawlResult.total, + completed: firecrawlResult.completed, + creditsUsed: firecrawlResult.creditsUsed, + expiresAt: firecrawlResult.expiresAt, + failedPages: failedPages.length, + }, + }; +} + +async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) { + const normalizedBaseUrl = normalizeUrl(baseUrl); + const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]); + const seedUrls = buildSeedUrls(normalizedBaseUrl, crawlTargets); + const seedUrlSet = new Set(seedUrls); const visitedUrls = new Set(); - const queuedUrls = new Set([normalizedBaseUrl]); - const pendingUrls = [normalizedBaseUrl]; + const queuedUrls = new Set(seedUrls); + const pendingUrls = [...seedUrls]; const pages = []; const failedPages = []; let discoveredInternalPages = 0; @@ -420,15 +738,29 @@ async function crawlPages(baseUrl, requestedPages) { continue; } + const isBootstrapSeed = seedUrlSet.has(nextUrl) && nextUrl === normalizedBaseUrl; + + if (!isBootstrapSeed && !isUrlAllowedByCrawlTargets(nextUrl, crawlTargets)) { + visitedUrls.add(nextUrl); + continue; + } + visitedUrls.add(nextUrl); try { const page = await fetchAnalyzedPage(nextUrl, allowedHostnames); visitedUrls.add(page.analyzedUrl); queuedUrls.add(page.analyzedUrl); - pages.push(page); + + if (isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)) { + pages.push(page); + } page.discoveredLinks.forEach((linkUrl) => { + if (!isUrlAllowedByCrawlTargets(linkUrl, crawlTargets)) { + return; + } + if (!visitedUrls.has(linkUrl) && !queuedUrls.has(linkUrl)) { queuedUrls.add(linkUrl); pendingUrls.push(linkUrl); @@ -523,30 +855,29 @@ function buildCrawlNotice({ requestedPages, actualPagesAnalyzed, failedPages, - discoveredInternalPages, - firecrawl, + crawlTargetSummary, }) { - if (requestedPages <= 1) { - return null; + const parts = []; + + if (requestedPages > 1) { + parts.push( + `The crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`, + ); } - const parts = [ - `Advanced crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`, - ]; + if (actualPagesAnalyzed < requestedPages) { + parts.push('Fewer matching crawlable pages were found than requested.'); + } - if (discoveredInternalPages + 1 < requestedPages) { - parts.push('Fewer crawlable internal HTML pages were discovered than requested.'); + if ((crawlTargetSummary?.includeTargets || []).length > 0 || (crawlTargetSummary?.excludeTargets || []).length > 0) { + parts.push('Custom include/exclude targeting was applied to this report.'); } if (failedPages > 0) { parts.push(`${failedPages} page${failedPages === 1 ? '' : 's'} could not be fetched during the crawl.`); } - if (firecrawl?.message) { - parts.push(firecrawl.message); - } - - return parts.join(' '); + return parts.length > 0 ? parts.join(' ') : null; } function buildAggregateAnalysis({ @@ -557,11 +888,14 @@ function buildAggregateAnalysis({ discoveredInternalPages, failedPages, firecrawl, + crawlTargets, + provider = 'internal', }) { const homepage = pageAnalyses[0]; const finishedAt = new Date(); const aggregateSchema = buildAggregateSchema(pageAnalyses); const aggregateSignals = buildAggregateSignals(pageAnalyses); + const crawlTargetSummary = summarizeCrawlTargets(crawlTargets); return { requestedUrl: normalizedUrl, @@ -581,7 +915,9 @@ function buildAggregateAnalysis({ allowedPages: entitlements.maxPagesPerCrawl, actualPagesAnalyzed: pageAnalyses.length, advancedCrawlEnabled: entitlements.canAdvancedCrawl, - provider: 'internal', + provider, + includeTargets: crawlTargetSummary.includeTargets, + excludeTargets: crawlTargetSummary.excludeTargets, }, crawlSummary: { pagesWithStructuredData: pageAnalyses.filter((page) => page.schema?.hasStructuredData).length, @@ -607,14 +943,13 @@ function buildAggregateAnalysis({ requestedPages, actualPagesAnalyzed: pageAnalyses.length, failedPages: failedPages.length, - discoveredInternalPages, - firecrawl, + crawlTargetSummary, }), finishedAt, }; } -function buildFailureAnalysis(normalizedUrl, error, firecrawl) { +function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'internal') { const isAxiosError = axios.isAxiosError(error); return { @@ -633,6 +968,9 @@ function buildFailureAnalysis(normalizedUrl, error, firecrawl) { rdfa: { count: 0, detected: false }, }, firecrawl, + crawlPlan: { + provider, + }, error: isAxiosError ? error.response ? `Request failed with status ${error.response.status}` @@ -1123,7 +1461,8 @@ module.exports = class SitesService { const requestedPages = parseRequestedPages(data?.requestedPages); const entitlements = ensureRequestedPagesAllowed(requestedPages, currentUser); const normalizedUrl = normalizeUrl(data?.url || data?.base_url); - const firecrawl = getFirecrawlScaffold({ requestedPages, entitlements }); + const crawlTargets = normalizeCrawlTargets(data, normalizedUrl); + let firecrawl = getFirecrawlScaffold({ requestedPages, entitlements }); const requestedName = typeof data?.name === 'string' && data.name.trim() ? data.name.trim() @@ -1190,12 +1529,50 @@ module.exports = class SitesService { } try { - const crawlResult = await crawlPages(normalizedUrl, requestedPages); + let crawlResult; + + if (firecrawl.shouldUseFirecrawl) { + try { + crawlResult = await crawlPagesWithFirecrawl(normalizedUrl, requestedPages, crawlTargets); + firecrawl = { + ...firecrawl, + currentProvider: 'firecrawl', + crawlId: crawlResult.firecrawlJob?.crawlId || null, + crawlStatus: crawlResult.firecrawlJob?.status || null, + creditsUsed: crawlResult.firecrawlJob?.creditsUsed || 0, + message: crawlResult.firecrawlJob?.status === 'failed' + ? 'Firecrawl ran for this paid request, but the crawl reported failures. Partial results are shown when available.' + : 'Firecrawl handled this paid request with sitemap-aware, JavaScript-rendered crawling.', + }; + } catch (error) { + console.error('Firecrawl crawl failed, falling back to internal crawl:', error); + firecrawl = { + ...firecrawl, + currentProvider: 'internal', + status: 'fallback_internal_after_error', + shouldUseFirecrawl: false, + fallbackReason: error.message, + message: `Firecrawl was selected for this paid request but failed to run (${error.message}). The analyzer fell back to the built-in crawler.`, + }; + crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets); + } + } else { + crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets); + firecrawl = { + ...firecrawl, + currentProvider: 'internal', + }; + } + const pageAnalyses = crawlResult.pages; if (pageAnalyses.length === 0) { const firstFailure = crawlResult.failedPages[0]; - const error = new Error(firstFailure?.error || 'Site analysis failed.'); + const error = new Error( + crawlTargets.includeTargets.length > 0 || crawlTargets.excludeTargets.length > 0 + ? 'No pages matched the include/exclude targeting rules you entered.' + : firstFailure?.error || 'Site analysis failed.', + ); error.code = 400; throw error; } @@ -1208,6 +1585,8 @@ module.exports = class SitesService { discoveredInternalPages: crawlResult.discoveredInternalPages, failedPages: crawlResult.failedPages, firecrawl, + crawlTargets, + provider: crawlResult.provider || 'internal', }); const homepage = pageAnalyses[0]; const finishedAt = analysis.finishedAt; @@ -1291,7 +1670,12 @@ module.exports = class SitesService { } catch (error) { console.error('Site analysis failed:', error); - const failureAnalysis = buildFailureAnalysis(normalizedUrl, error, firecrawl); + const failureAnalysis = buildFailureAnalysis( + normalizedUrl, + error, + firecrawl, + firecrawl?.currentProvider || 'internal', + ); const failedAt = new Date(); const failureTransaction = await db.sequelize.transaction(); let failedSite; @@ -1351,7 +1735,7 @@ module.exports = class SitesService { allowedPages: entitlements.maxPagesPerCrawl, actualPagesAnalyzed: 0, advancedCrawlEnabled: entitlements.canAdvancedCrawl, - provider: 'internal', + provider: failureAnalysis.crawlPlan?.provider || 'internal', }, entitlements, }, diff --git a/frontend/src/helpers/siteEntitlements.ts b/frontend/src/helpers/siteEntitlements.ts index c0d14f9..1877880 100644 --- a/frontend/src/helpers/siteEntitlements.ts +++ b/frontend/src/helpers/siteEntitlements.ts @@ -1,6 +1,6 @@ import { hasPermission } from './userPermissions'; -export const BASIC_MAX_PAGES_PER_CRAWL = 1; +export const BASIC_MAX_PAGES_PER_CRAWL = 25; export const ADVANCED_MAX_PAGES_PER_CRAWL = 25; export const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL'; export const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT'; diff --git a/frontend/src/pages/sites/analyzer.tsx b/frontend/src/pages/sites/analyzer.tsx index 7d3bbf3..4656773 100644 --- a/frontend/src/pages/sites/analyzer.tsx +++ b/frontend/src/pages/sites/analyzer.tsx @@ -5,6 +5,7 @@ import React, { ReactElement } from 'react'; import { ToastContainer, toast } from 'react-toastify'; import BaseButton from '../../components/BaseButton'; import BaseButtons from '../../components/BaseButtons'; +import BaseIcon from '../../components/BaseIcon'; import CardBox from '../../components/CardBox'; import FormField from '../../components/FormField'; import LayoutAuthenticated from '../../layouts/Authenticated'; @@ -38,6 +39,8 @@ type AnalysisPayload = { actualPagesAnalyzed?: number; advancedCrawlEnabled?: boolean; provider?: string; + includeTargets?: string[]; + excludeTargets?: string[]; }; crawlSummary?: { pagesWithStructuredData?: number; @@ -66,7 +69,14 @@ type AnalysisPayload = { status?: string; wouldHandleJavascript?: boolean; wouldHandleSitemapDiscovery?: boolean; - shouldUseFirecrawlLater?: boolean; + availableForCurrentUser?: boolean; + shouldUseFirecrawl?: boolean; + usePaidOnly?: boolean; + currentProvider?: string; + crawlId?: string | null; + crawlStatus?: string | null; + creditsUsed?: number; + fallbackReason?: string; message?: string; }; platform?: { @@ -123,6 +133,25 @@ type ReportResponse = { error?: string; }; +type SetupSectionId = 'targeting' | 'options' | 'limits'; +type ResultsTabId = 'overview' | 'pages' | 'recommendations' | 'delivery'; + +type SetupAccordionSectionProps = { + title: string; + description: string; + badge?: React.ReactNode; + isOpen: boolean; + onToggle: () => void; + children: React.ReactNode; +}; + +type ResultsTabButtonProps = { + label: string; + count?: number | string; + isActive: boolean; + onClick: () => void; +}; + const PLATFORM_OPTIONS = [ { value: 'wordpress', label: 'WordPress' }, { value: 'shopify', label: 'Shopify' }, @@ -132,10 +161,77 @@ const PLATFORM_OPTIONS = [ const initialReport: ReportResponse | null = null; +const parseTargetLines = (value: string) => value + .split(/\r?\n/) + .map((entry) => entry.trim()) + .filter(Boolean); + +const SetupAccordionSection = ({ + title, + description, + badge, + isOpen, + onToggle, + children, +}: SetupAccordionSectionProps) => ( +
- Enter a domain or full URL. The app will detect the platform, crawl up to your allowed page limit, - inspect structured data across the discovered pages, generate rules-based schema recommendations, - and prepare developer-ready code snippets. + Enter a domain or full URL, choose how many pages to review, and optionally focus the report on the + folders, categories, or pages that matter most. This setup keeps the page cleaner on mobile while still + supporting up to {maxPagesPerCrawl} pages per crawl.
+- Advanced Crawl is now enforced and active. Premium still reserves Step 4 platform output. Firecrawl is scaffolded for sitemap + JS-rendered crawling, but not activated yet. -
-- Sitemap discovery and JS-rendered crawl are planned through Firecrawl. This environment is currently using the built-in crawler only. -
-- Export a developer handoff file or email the latest recommendations directly. -
-- High-level crawl and structured-data summary from the latest analysis run. -
-- Prioritized next actions with ready-to-copy schema where possible. -
-- {recommendation.reason} -
- {recommendation.expected_impact && ( -- Expected impact:{' '} - {recommendation.expected_impact} -
- )} -+ Keep this aligned with the CMS or platform your developer will implement against. +
+
- {recommendation.suggested_schema || 'No code snippet generated for this recommendation.'}
-
+ + Review the latest crawl summary, page-level findings, prioritized recommendations, and delivery actions from one mobile-friendly workspace. +
++ {recommendation.reason} +
+ {recommendation.expected_impact && ( ++ Expected impact:{' '} + {recommendation.expected_impact} +
+ )} +
+ {recommendation.suggested_schema || 'No code snippet generated for this recommendation.'}
+
+ + Export a developer handoff file, email the latest recommendations, or check Step 4 output for the selected platform. +
+