diff --git a/backend/.env b/backend/.env index ae9b07a..c8a42c3 100644 --- a/backend/.env +++ b/backend/.env @@ -12,3 +12,6 @@ EMAIL_USER=AKIAVEW7G4PQUBGM52OF EMAIL_PASS=BLnD4hKGb6YkSz3gaQrf8fnyLi3C3/EdjOOsLEDTDPTz SECRET_KEY=HUEyqESqgQ1yTwzVlO6wprC9Kf1J1xuA PEXELS_KEY=Vc99rnmOhHhJAbgGQoKLZtsaIVfkeownoQNbTj78VemUjKh08ZYRbf18 +FIRECRAWL_API_KEY=fc-409763513f6c458c9d1d09e460346b17 +FIRECRAWL_BASE_URL=https://api.firecrawl.dev/v2 +FIRECRAWL_ENABLED=true diff --git a/backend/src/services/firecrawl.js b/backend/src/services/firecrawl.js index 8a5fe02..75e712c 100644 --- a/backend/src/services/firecrawl.js +++ b/backend/src/services/firecrawl.js @@ -1,4 +1,49 @@ -const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v1'; +const fs = require('fs'); +const path = require('path'); +const axios = require('axios'); + +const FIRECRAWL_DEFAULT_BASE_URL = 'https://api.firecrawl.dev/v2'; +const FIRECRAWL_DEFAULT_POLL_INTERVAL_MS = 2000; +const FIRECRAWL_DEFAULT_TIMEOUT_MS = 45000; + +const BACKEND_ENV_PATH = path.join(__dirname, '..', '..', '.env'); + +function readBackendEnvFile() { + try { + const raw = fs.readFileSync(BACKEND_ENV_PATH, 'utf8'); + + return raw.split(/\r?\n/).reduce((accumulator, line) => { + const trimmedLine = line.trim(); + + if (!trimmedLine || trimmedLine.startsWith('#')) { + return accumulator; + } + + const separatorIndex = trimmedLine.indexOf('='); + + if (separatorIndex === -1) { + return accumulator; + } + + const key = trimmedLine.slice(0, separatorIndex).trim(); + const value = trimmedLine.slice(separatorIndex + 1).trim(); + + accumulator[key] = value.replace(/^"|"$/g, '').replace(/^'|'$/g, ''); + return accumulator; + }, {}); + } catch (error) { + return {}; + } +} + +function getEnvValue(name) { + if (process.env[name] !== undefined && process.env[name] !== null && process.env[name] !== '') { + return process.env[name]; + } + + return readBackendEnvFile()[name]; +} + function toBoolean(value, defaultValue = false) { if (value === undefined || value === null || value === '') { @@ -22,12 +67,34 @@ function toBoolean(value, defaultValue = false) { return defaultValue; } +function toPositiveInteger(value, defaultValue) { + const parsed = Number(value); + + if (Number.isInteger(parsed) && parsed > 0) { + return parsed; + } + + return defaultValue; +} + +function normalizeBaseUrl(baseUrl) { + return String(baseUrl || FIRECRAWL_DEFAULT_BASE_URL) + .trim() + .replace(/\/+$/, ''); +} + function getFirecrawlRuntime() { - const apiKey = String(process.env.FIRECRAWL_API_KEY || '').trim(); - const baseUrl = String( - process.env.FIRECRAWL_BASE_URL || FIRECRAWL_DEFAULT_BASE_URL, - ).trim(); - const enabled = toBoolean(process.env.FIRECRAWL_ENABLED, true); + const apiKey = String(getEnvValue('FIRECRAWL_API_KEY') || '').trim(); + const baseUrl = normalizeBaseUrl(getEnvValue('FIRECRAWL_BASE_URL')); + const enabled = toBoolean(getEnvValue('FIRECRAWL_ENABLED'), true); + const pollIntervalMs = toPositiveInteger( + getEnvValue('FIRECRAWL_POLL_INTERVAL_MS'), + FIRECRAWL_DEFAULT_POLL_INTERVAL_MS, + ); + const timeoutMs = toPositiveInteger( + getEnvValue('FIRECRAWL_TIMEOUT_MS'), + FIRECRAWL_DEFAULT_TIMEOUT_MS, + ); return { provider: 'firecrawl', @@ -35,29 +102,255 @@ function getFirecrawlRuntime() { enabled, configured: Boolean(apiKey), hasApiKey: Boolean(apiKey), - mode: 'scaffold_only', + apiKey, + pollIntervalMs, + timeoutMs, + mode: enabled && apiKey ? 'active' : 'scaffold_only', }; } +function buildFirecrawlMessage(runtime, entitlements, requestedPages) { + if (!entitlements?.canAdvancedCrawl) { + return 'Firecrawl is reserved for paid Advanced Crawl users. This request will stay on the built-in crawler.'; + } + + if (!runtime.enabled) { + return 'Firecrawl is configured in code, but FIRECRAWL_ENABLED is turned off. Paid users will stay on the built-in crawler until it is enabled.'; + } + + if (!runtime.configured) { + return 'Firecrawl is enabled for paid users, but FIRECRAWL_API_KEY is missing. Falling back to the built-in crawler until the key is configured.'; + } + + return requestedPages > 1 + ? 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered multi-page crawling.' + : 'Paid Advanced Crawl users are routed through Firecrawl for sitemap-aware, JavaScript-rendered crawling.'; +} + function getFirecrawlScaffold({ requestedPages, entitlements } = {}) { const runtime = getFirecrawlRuntime(); - const wantsAdvancedCrawl = Number(requestedPages || 1) > 1; - const advancedCrawlUnlocked = Boolean(entitlements?.canAdvancedCrawl); - const shouldUseFirecrawlLater = runtime.enabled && (wantsAdvancedCrawl || advancedCrawlUnlocked); + const availableForCurrentUser = Boolean(entitlements?.canAdvancedCrawl); + const shouldUseFirecrawl = Boolean( + availableForCurrentUser + && runtime.enabled + && runtime.configured, + ); return { - ...runtime, - status: runtime.configured ? 'ready_for_activation' : 'awaiting_api_key', + provider: 'firecrawl', + baseUrl: runtime.baseUrl, + enabled: runtime.enabled, + configured: runtime.configured, + hasApiKey: runtime.hasApiKey, + mode: shouldUseFirecrawl ? 'active' : runtime.mode, + status: shouldUseFirecrawl ? 'active_for_paid_users' : 'scaffold_only', wouldHandleJavascript: true, wouldHandleSitemapDiscovery: true, - shouldUseFirecrawlLater, - message: runtime.configured - ? 'Firecrawl scaffold is wired and ready for the next activation step, but this analyzer still uses the built-in crawler today.' - : 'Firecrawl scaffold is wired, but FIRECRAWL_API_KEY is not set yet. The analyzer still uses the built-in crawler for now.', + availableForCurrentUser, + shouldUseFirecrawl, + usePaidOnly: true, + message: buildFirecrawlMessage(runtime, entitlements, requestedPages), + }; +} + +function sleep(milliseconds) { + return new Promise((resolve) => { + setTimeout(resolve, milliseconds); + }); +} + +function isAbsoluteUrl(value) { + return /^https?:\/\//i.test(String(value || '')); +} + +function buildApiUrl(runtime, pathOrUrl) { + const value = String(pathOrUrl || '').trim(); + + if (!value) { + return runtime.baseUrl; + } + + if (isAbsoluteUrl(value)) { + return value; + } + + if (value.startsWith('/')) { + return `${runtime.baseUrl}${value}`; + } + + return `${runtime.baseUrl}/${value}`; +} + +function summarizeFirecrawlPayload(payload) { + if (!payload) { + return 'Unknown Firecrawl API error.'; + } + + if (typeof payload === 'string') { + return payload; + } + + if (typeof payload?.error === 'string' && payload.error.trim()) { + return payload.error; + } + + if (typeof payload?.message === 'string' && payload.message.trim()) { + return payload.message; + } + + return 'Unexpected Firecrawl API response.'; +} + +async function firecrawlRequest(runtime, method, pathOrUrl, options = {}) { + try { + const response = await axios({ + method, + url: buildApiUrl(runtime, pathOrUrl), + timeout: options.timeout || runtime.timeoutMs, + data: options.data, + headers: { + Authorization: `Bearer ${runtime.apiKey}`, + 'Content-Type': 'application/json', + ...(options.headers || {}), + }, + }); + + return response.data; + } catch (error) { + if (axios.isAxiosError(error)) { + const payload = error.response?.data; + const detail = summarizeFirecrawlPayload(payload); + const status = error.response?.status; + const wrappedError = new Error( + status + ? `Firecrawl request failed with status ${status}: ${detail}` + : `Firecrawl request failed: ${detail}`, + ); + + wrappedError.code = status || 502; + wrappedError.response = payload; + throw wrappedError; + } + + throw error; + } +} + +async function collectPagedStatus(runtime, initialStatus) { + const documents = Array.isArray(initialStatus?.data) + ? [...initialStatus.data] + : []; + let nextUrl = initialStatus?.next || null; + + while (nextUrl) { + const nextStatus = await firecrawlRequest(runtime, 'get', nextUrl); + + if (Array.isArray(nextStatus?.data) && nextStatus.data.length > 0) { + documents.push(...nextStatus.data); + } + + nextUrl = nextStatus?.next || null; + } + + return { + ...initialStatus, + data: documents, + next: null, + }; +} + +async function waitForCrawlCompletion(runtime, crawlId) { + const deadline = Date.now() + runtime.timeoutMs; + + while (Date.now() <= deadline) { + const status = await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}`); + + if (status?.status === 'completed' || status?.status === 'failed') { + return collectPagedStatus(runtime, status); + } + + await sleep(runtime.pollIntervalMs); + } + + const timeoutError = new Error( + `Firecrawl crawl timed out after ${Math.round(runtime.timeoutMs / 1000)} seconds.`, + ); + timeoutError.code = 504; + throw timeoutError; +} + +async function getCrawlErrors(runtime, crawlId) { + try { + return await firecrawlRequest(runtime, 'get', `/crawl/${encodeURIComponent(crawlId)}/errors`); + } catch (error) { + console.error('Failed to fetch Firecrawl crawl errors:', error); + return { + errors: [], + robotsBlocked: [], + }; + } +} + +async function crawlSiteWithFirecrawl(url, requestedPages) { + const runtime = getFirecrawlRuntime(); + + if (!runtime.enabled) { + const error = new Error('Firecrawl is disabled in this environment.'); + error.code = 503; + throw error; + } + + if (!runtime.configured) { + const error = new Error('Firecrawl API key is not configured.'); + error.code = 503; + throw error; + } + + const started = await firecrawlRequest(runtime, 'post', '/crawl', { + data: { + url, + limit: requestedPages, + sitemap: 'include', + crawlEntireDomain: true, + allowExternalLinks: false, + allowSubdomains: false, + ignoreQueryParameters: true, + scrapeOptions: { + formats: ['html'], + }, + }, + }); + + const crawlId = started?.id; + + if (!crawlId) { + const error = new Error('Firecrawl did not return a crawl job ID.'); + error.code = 502; + error.response = started; + throw error; + } + + const status = await waitForCrawlCompletion(runtime, crawlId); + const crawlErrors = await getCrawlErrors(runtime, crawlId); + + return { + crawlId, + provider: 'firecrawl', + status: status?.status || 'unknown', + total: status?.total || 0, + completed: status?.completed || 0, + creditsUsed: status?.creditsUsed || 0, + expiresAt: status?.expiresAt || null, + data: Array.isArray(status?.data) ? status.data : [], + errors: Array.isArray(crawlErrors?.errors) ? crawlErrors.errors : [], + robotsBlocked: Array.isArray(crawlErrors?.robotsBlocked) + ? crawlErrors.robotsBlocked + : [], }; } module.exports = { getFirecrawlRuntime, getFirecrawlScaffold, + crawlSiteWithFirecrawl, }; diff --git a/backend/src/services/siteEntitlements.js b/backend/src/services/siteEntitlements.js index ce3ec26..47e7c53 100644 --- a/backend/src/services/siteEntitlements.js +++ b/backend/src/services/siteEntitlements.js @@ -1,6 +1,6 @@ const ValidationError = require('./notifications/errors/validation'); -const BASIC_MAX_PAGES_PER_CRAWL = 1; +const BASIC_MAX_PAGES_PER_CRAWL = 25; const ADVANCED_MAX_PAGES_PER_CRAWL = 25; const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL'; const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT'; @@ -68,7 +68,7 @@ function ensureRequestedPagesAllowed(requestedPages, currentUser) { if (requestedPages > entitlements.maxPagesPerCrawl) { const error = new Error( - `Your current plan allows up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Upgrade to Advanced Crawl to analyze ${requestedPages} pages.`, + `This analyzer supports up to ${entitlements.maxPagesPerCrawl} page${entitlements.maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Reduce the requested page count to continue.`, ); error.code = 403; throw error; diff --git a/backend/src/services/sites.js b/backend/src/services/sites.js index 9ee0055..ac36a79 100644 --- a/backend/src/services/sites.js +++ b/backend/src/services/sites.js @@ -8,7 +8,7 @@ const { ensureRequestedPagesAllowed, ensurePlatformOutputAllowed, } = require('./siteEntitlements'); -const { getFirecrawlScaffold } = require('./firecrawl'); +const { getFirecrawlScaffold, crawlSiteWithFirecrawl } = require('./firecrawl'); const REQUEST_TIMEOUT = 15000; const PREVIEW_LIMIT = 5; @@ -271,6 +271,152 @@ function normalizeAllowedHostnames(allowedHostnames) { return new Set(); } +function normalizeTargetPathname(pathname) { + const trimmedPathname = String(pathname || '').trim(); + + if (!trimmedPathname || trimmedPathname === '/') { + return '/'; + } + + return `/${trimmedPathname.replace(/^\/+/, '').replace(/\/+$/, '')}`; +} + +function buildCrawlTarget(rawTarget, baseUrl, label) { + const trimmedTarget = String(rawTarget || '').trim(); + + if (!trimmedTarget) { + return null; + } + + let parsedTarget; + + try { + if (/^https?:\/\//i.test(trimmedTarget)) { + parsedTarget = new URL(trimmedTarget); + } else if (trimmedTarget.startsWith('/')) { + parsedTarget = new URL(trimmedTarget, baseUrl); + } else { + parsedTarget = new URL(`/${trimmedTarget.replace(/^\/+/, '')}`, baseUrl); + } + } catch (error) { + const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`); + targetError.code = 400; + throw targetError; + } + + if (!['http:', 'https:'].includes(parsedTarget.protocol)) { + const targetError = new Error(`Invalid ${label} target: ${trimmedTarget}`); + targetError.code = 400; + throw targetError; + } + + const baseHostname = new URL(baseUrl).hostname.toLowerCase(); + + if (parsedTarget.hostname.toLowerCase() !== baseHostname) { + const targetError = new Error( + `${label} targets must stay on the same website as the analyzed URL.`, + ); + targetError.code = 400; + throw targetError; + } + + parsedTarget.hash = ''; + parsedTarget.search = ''; + + const path = normalizeTargetPathname(parsedTarget.pathname); + const url = normalizeUrl(parsedTarget.toString()); + + return { + input: trimmedTarget, + label: /^https?:\/\//i.test(trimmedTarget) ? url : path, + path, + url, + }; +} + +function parseCrawlTargets(rawTargets, baseUrl, label) { + const targetValues = Array.isArray(rawTargets) + ? rawTargets + : String(rawTargets || '').split(/\r?\n/); + const dedupedTargets = new Map(); + + targetValues + .map((targetValue) => String(targetValue || '').trim()) + .filter(Boolean) + .forEach((targetValue) => { + const normalizedTarget = buildCrawlTarget(targetValue, baseUrl, label); + + dedupedTargets.set(normalizedTarget.url, normalizedTarget); + }); + + return Array.from(dedupedTargets.values()); +} + +function normalizeCrawlTargets(data, baseUrl) { + return { + includeTargets: parseCrawlTargets(data?.includeTargets, baseUrl, 'include'), + excludeTargets: parseCrawlTargets(data?.excludeTargets, baseUrl, 'exclude'), + }; +} + +function isUrlMatchingTarget(candidateUrl, target) { + if (!candidateUrl || !target?.path) { + return false; + } + + let parsedUrl; + + try { + parsedUrl = new URL(normalizeUrl(candidateUrl)); + } catch (error) { + return false; + } + + const candidatePath = normalizeTargetPathname(parsedUrl.pathname); + + if (target.path === '/') { + return true; + } + + return candidatePath === target.path || candidatePath.startsWith(`${target.path}/`); +} + +function matchesAnyCrawlTarget(candidateUrl, targets = []) { + return targets.some((target) => isUrlMatchingTarget(candidateUrl, target)); +} + +function isUrlAllowedByCrawlTargets(candidateUrl, crawlTargets = {}) { + const includeTargets = crawlTargets.includeTargets || []; + const excludeTargets = crawlTargets.excludeTargets || []; + + if (includeTargets.length > 0 && !matchesAnyCrawlTarget(candidateUrl, includeTargets)) { + return false; + } + + if (excludeTargets.length > 0 && matchesAnyCrawlTarget(candidateUrl, excludeTargets)) { + return false; + } + + return true; +} + +function buildSeedUrls(baseUrl, crawlTargets = {}) { + const seedUrls = new Set([baseUrl]); + + (crawlTargets.includeTargets || []).forEach((target) => { + seedUrls.add(target.url); + }); + + return Array.from(seedUrls); +} + +function summarizeCrawlTargets(crawlTargets = {}) { + return { + includeTargets: (crawlTargets.includeTargets || []).map((target) => target.label), + excludeTargets: (crawlTargets.excludeTargets || []).map((target) => target.label), + }; +} + function normalizeCrawlUrl(rawUrl, parentUrl, allowedHostnames) { if (!rawUrl || typeof rawUrl !== 'string') { return null; @@ -403,12 +549,184 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) { }; } -async function crawlPages(baseUrl, requestedPages) { +function analyzeFetchedPage({ + requestedUrl, + analyzedUrl, + html, + statusCode, + headers = {}, + allowedHostnames, + discoveredLinks = null, + pageTitle = null, +}) { + const normalizedAnalyzedUrl = normalizeUrl(analyzedUrl || requestedUrl); + const normalizedAllowedHostnames = normalizeAllowedHostnames(allowedHostnames); + const analyzedHostname = new URL(normalizedAnalyzedUrl).hostname.toLowerCase(); + normalizedAllowedHostnames.add(analyzedHostname); + + if (allowedHostnames instanceof Set) { + allowedHostnames.add(analyzedHostname); + } + + const resolvedHtml = typeof html === 'string' ? html : ''; + const resolvedPageTitle = pageTitle || extractPageTitle(resolvedHtml); + const platform = detectPlatform(resolvedHtml, headers, normalizedAnalyzedUrl); + const schema = extractSchemaSummary(resolvedHtml); + const pageSignals = inferPageSignals( + resolvedHtml, + normalizedAnalyzedUrl, + resolvedPageTitle, + platform, + ); + const normalizedLinks = Array.isArray(discoveredLinks) + ? Array.from( + new Set( + discoveredLinks + .map((linkUrl) => normalizeCrawlUrl(linkUrl, normalizedAnalyzedUrl, normalizedAllowedHostnames)) + .filter(Boolean), + ), + ) + : extractInternalLinks( + resolvedHtml, + normalizedAnalyzedUrl, + normalizedAllowedHostnames, + ); + + return { + requestedUrl: requestedUrl || normalizedAnalyzedUrl, + analyzedUrl: normalizedAnalyzedUrl, + pageTitle: resolvedPageTitle, + statusCode: statusCode || null, + html: resolvedHtml, + platform, + schema, + pageSignals, + discoveredLinks: normalizedLinks, + }; +} + +function transformFirecrawlDocument(document, allowedHostnames) { + const metadata = document?.metadata || {}; + const sourceUrl = + metadata.sourceURL + || metadata.sourceUrl + || metadata.url + || document?.url + || document?.sourceURL + || document?.sourceUrl; + + if (!sourceUrl) { + return null; + } + + const html = + typeof document?.html === 'string' + ? document.html + : typeof document?.rawHtml === 'string' + ? document.rawHtml + : typeof document?.content === 'string' + ? document.content + : ''; + + return analyzeFetchedPage({ + requestedUrl: sourceUrl, + analyzedUrl: sourceUrl, + html, + statusCode: document?.metadata?.statusCode || 200, + headers: {}, + allowedHostnames, + discoveredLinks: Array.isArray(document?.links) ? document.links : null, + pageTitle: metadata.title || null, + }); +} + +async function crawlPagesWithFirecrawl(baseUrl, requestedPages, crawlTargets = {}) { const normalizedBaseUrl = normalizeUrl(baseUrl); const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]); + const firecrawlResult = await crawlSiteWithFirecrawl(normalizedBaseUrl, requestedPages); + const pages = []; + const analyzedUrls = new Set(); + + (firecrawlResult.data || []).forEach((document) => { + try { + const page = transformFirecrawlDocument(document, allowedHostnames); + + if ( + !page + || analyzedUrls.has(page.analyzedUrl) + || !isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets) + ) { + return; + } + + analyzedUrls.add(page.analyzedUrl); + pages.push(page); + } catch (error) { + console.error('Failed to transform Firecrawl document:', error); + } + }); + + const failedPages = []; + + (firecrawlResult.errors || []).forEach((entry) => { + const failedUrl = normalizeCrawlUrl( + entry?.path || entry?.url || entry?.sourceURL || normalizedBaseUrl, + normalizedBaseUrl, + allowedHostnames, + ) || normalizedBaseUrl; + + if (!isUrlAllowedByCrawlTargets(failedUrl, crawlTargets)) { + return; + } + + failedPages.push({ + url: failedUrl, + error: entry?.error || entry?.message || 'Firecrawl could not fetch this page.', + }); + }); + + (firecrawlResult.robotsBlocked || []).forEach((entry) => { + const blockedUrl = normalizeCrawlUrl( + entry?.path || entry?.url || normalizedBaseUrl, + normalizedBaseUrl, + allowedHostnames, + ) || normalizedBaseUrl; + + if (!isUrlAllowedByCrawlTargets(blockedUrl, crawlTargets)) { + return; + } + + failedPages.push({ + url: blockedUrl, + error: 'Blocked by robots.txt during Firecrawl crawl.', + }); + }); + + return { + provider: 'firecrawl', + pages, + failedPages, + discoveredInternalPages: Math.max((firecrawlResult.total || pages.length) - 1, 0), + firecrawlJob: { + crawlId: firecrawlResult.crawlId, + status: firecrawlResult.status, + total: firecrawlResult.total, + completed: firecrawlResult.completed, + creditsUsed: firecrawlResult.creditsUsed, + expiresAt: firecrawlResult.expiresAt, + failedPages: failedPages.length, + }, + }; +} + +async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) { + const normalizedBaseUrl = normalizeUrl(baseUrl); + const allowedHostnames = new Set([new URL(normalizedBaseUrl).hostname.toLowerCase()]); + const seedUrls = buildSeedUrls(normalizedBaseUrl, crawlTargets); + const seedUrlSet = new Set(seedUrls); const visitedUrls = new Set(); - const queuedUrls = new Set([normalizedBaseUrl]); - const pendingUrls = [normalizedBaseUrl]; + const queuedUrls = new Set(seedUrls); + const pendingUrls = [...seedUrls]; const pages = []; const failedPages = []; let discoveredInternalPages = 0; @@ -420,15 +738,29 @@ async function crawlPages(baseUrl, requestedPages) { continue; } + const isBootstrapSeed = seedUrlSet.has(nextUrl) && nextUrl === normalizedBaseUrl; + + if (!isBootstrapSeed && !isUrlAllowedByCrawlTargets(nextUrl, crawlTargets)) { + visitedUrls.add(nextUrl); + continue; + } + visitedUrls.add(nextUrl); try { const page = await fetchAnalyzedPage(nextUrl, allowedHostnames); visitedUrls.add(page.analyzedUrl); queuedUrls.add(page.analyzedUrl); - pages.push(page); + + if (isUrlAllowedByCrawlTargets(page.analyzedUrl, crawlTargets)) { + pages.push(page); + } page.discoveredLinks.forEach((linkUrl) => { + if (!isUrlAllowedByCrawlTargets(linkUrl, crawlTargets)) { + return; + } + if (!visitedUrls.has(linkUrl) && !queuedUrls.has(linkUrl)) { queuedUrls.add(linkUrl); pendingUrls.push(linkUrl); @@ -523,30 +855,29 @@ function buildCrawlNotice({ requestedPages, actualPagesAnalyzed, failedPages, - discoveredInternalPages, - firecrawl, + crawlTargetSummary, }) { - if (requestedPages <= 1) { - return null; + const parts = []; + + if (requestedPages > 1) { + parts.push( + `The crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`, + ); } - const parts = [ - `Advanced crawl analyzed ${actualPagesAnalyzed} of ${requestedPages} requested page${requestedPages === 1 ? '' : 's'}.`, - ]; + if (actualPagesAnalyzed < requestedPages) { + parts.push('Fewer matching crawlable pages were found than requested.'); + } - if (discoveredInternalPages + 1 < requestedPages) { - parts.push('Fewer crawlable internal HTML pages were discovered than requested.'); + if ((crawlTargetSummary?.includeTargets || []).length > 0 || (crawlTargetSummary?.excludeTargets || []).length > 0) { + parts.push('Custom include/exclude targeting was applied to this report.'); } if (failedPages > 0) { parts.push(`${failedPages} page${failedPages === 1 ? '' : 's'} could not be fetched during the crawl.`); } - if (firecrawl?.message) { - parts.push(firecrawl.message); - } - - return parts.join(' '); + return parts.length > 0 ? parts.join(' ') : null; } function buildAggregateAnalysis({ @@ -557,11 +888,14 @@ function buildAggregateAnalysis({ discoveredInternalPages, failedPages, firecrawl, + crawlTargets, + provider = 'internal', }) { const homepage = pageAnalyses[0]; const finishedAt = new Date(); const aggregateSchema = buildAggregateSchema(pageAnalyses); const aggregateSignals = buildAggregateSignals(pageAnalyses); + const crawlTargetSummary = summarizeCrawlTargets(crawlTargets); return { requestedUrl: normalizedUrl, @@ -581,7 +915,9 @@ function buildAggregateAnalysis({ allowedPages: entitlements.maxPagesPerCrawl, actualPagesAnalyzed: pageAnalyses.length, advancedCrawlEnabled: entitlements.canAdvancedCrawl, - provider: 'internal', + provider, + includeTargets: crawlTargetSummary.includeTargets, + excludeTargets: crawlTargetSummary.excludeTargets, }, crawlSummary: { pagesWithStructuredData: pageAnalyses.filter((page) => page.schema?.hasStructuredData).length, @@ -607,14 +943,13 @@ function buildAggregateAnalysis({ requestedPages, actualPagesAnalyzed: pageAnalyses.length, failedPages: failedPages.length, - discoveredInternalPages, - firecrawl, + crawlTargetSummary, }), finishedAt, }; } -function buildFailureAnalysis(normalizedUrl, error, firecrawl) { +function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'internal') { const isAxiosError = axios.isAxiosError(error); return { @@ -633,6 +968,9 @@ function buildFailureAnalysis(normalizedUrl, error, firecrawl) { rdfa: { count: 0, detected: false }, }, firecrawl, + crawlPlan: { + provider, + }, error: isAxiosError ? error.response ? `Request failed with status ${error.response.status}` @@ -1123,7 +1461,8 @@ module.exports = class SitesService { const requestedPages = parseRequestedPages(data?.requestedPages); const entitlements = ensureRequestedPagesAllowed(requestedPages, currentUser); const normalizedUrl = normalizeUrl(data?.url || data?.base_url); - const firecrawl = getFirecrawlScaffold({ requestedPages, entitlements }); + const crawlTargets = normalizeCrawlTargets(data, normalizedUrl); + let firecrawl = getFirecrawlScaffold({ requestedPages, entitlements }); const requestedName = typeof data?.name === 'string' && data.name.trim() ? data.name.trim() @@ -1190,12 +1529,50 @@ module.exports = class SitesService { } try { - const crawlResult = await crawlPages(normalizedUrl, requestedPages); + let crawlResult; + + if (firecrawl.shouldUseFirecrawl) { + try { + crawlResult = await crawlPagesWithFirecrawl(normalizedUrl, requestedPages, crawlTargets); + firecrawl = { + ...firecrawl, + currentProvider: 'firecrawl', + crawlId: crawlResult.firecrawlJob?.crawlId || null, + crawlStatus: crawlResult.firecrawlJob?.status || null, + creditsUsed: crawlResult.firecrawlJob?.creditsUsed || 0, + message: crawlResult.firecrawlJob?.status === 'failed' + ? 'Firecrawl ran for this paid request, but the crawl reported failures. Partial results are shown when available.' + : 'Firecrawl handled this paid request with sitemap-aware, JavaScript-rendered crawling.', + }; + } catch (error) { + console.error('Firecrawl crawl failed, falling back to internal crawl:', error); + firecrawl = { + ...firecrawl, + currentProvider: 'internal', + status: 'fallback_internal_after_error', + shouldUseFirecrawl: false, + fallbackReason: error.message, + message: `Firecrawl was selected for this paid request but failed to run (${error.message}). The analyzer fell back to the built-in crawler.`, + }; + crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets); + } + } else { + crawlResult = await crawlPages(normalizedUrl, requestedPages, crawlTargets); + firecrawl = { + ...firecrawl, + currentProvider: 'internal', + }; + } + const pageAnalyses = crawlResult.pages; if (pageAnalyses.length === 0) { const firstFailure = crawlResult.failedPages[0]; - const error = new Error(firstFailure?.error || 'Site analysis failed.'); + const error = new Error( + crawlTargets.includeTargets.length > 0 || crawlTargets.excludeTargets.length > 0 + ? 'No pages matched the include/exclude targeting rules you entered.' + : firstFailure?.error || 'Site analysis failed.', + ); error.code = 400; throw error; } @@ -1208,6 +1585,8 @@ module.exports = class SitesService { discoveredInternalPages: crawlResult.discoveredInternalPages, failedPages: crawlResult.failedPages, firecrawl, + crawlTargets, + provider: crawlResult.provider || 'internal', }); const homepage = pageAnalyses[0]; const finishedAt = analysis.finishedAt; @@ -1291,7 +1670,12 @@ module.exports = class SitesService { } catch (error) { console.error('Site analysis failed:', error); - const failureAnalysis = buildFailureAnalysis(normalizedUrl, error, firecrawl); + const failureAnalysis = buildFailureAnalysis( + normalizedUrl, + error, + firecrawl, + firecrawl?.currentProvider || 'internal', + ); const failedAt = new Date(); const failureTransaction = await db.sequelize.transaction(); let failedSite; @@ -1351,7 +1735,7 @@ module.exports = class SitesService { allowedPages: entitlements.maxPagesPerCrawl, actualPagesAnalyzed: 0, advancedCrawlEnabled: entitlements.canAdvancedCrawl, - provider: 'internal', + provider: failureAnalysis.crawlPlan?.provider || 'internal', }, entitlements, }, diff --git a/frontend/src/components/Logo/index.tsx b/frontend/src/components/Logo/index.tsx index a582e29..30747d0 100644 --- a/frontend/src/components/Logo/index.tsx +++ b/frontend/src/components/Logo/index.tsx @@ -1,3 +1,4 @@ +import Image from 'next/image' import React from 'react' type Props = { @@ -6,10 +7,12 @@ type Props = { export default function Logo({ className = '' }: Props) { return ( - {'Flatlogic - + alt="Flatlogic logo" + width={160} + height={32} + /> ) } diff --git a/frontend/src/helpers/siteEntitlements.ts b/frontend/src/helpers/siteEntitlements.ts index c0d14f9..1877880 100644 --- a/frontend/src/helpers/siteEntitlements.ts +++ b/frontend/src/helpers/siteEntitlements.ts @@ -1,6 +1,6 @@ import { hasPermission } from './userPermissions'; -export const BASIC_MAX_PAGES_PER_CRAWL = 1; +export const BASIC_MAX_PAGES_PER_CRAWL = 25; export const ADVANCED_MAX_PAGES_PER_CRAWL = 25; export const ADVANCED_CRAWL_PERMISSION = 'USE_ADVANCED_CRAWL'; export const PLATFORM_OUTPUT_PERMISSION = 'USE_PLATFORM_OUTPUT'; diff --git a/frontend/src/pages/profile.tsx b/frontend/src/pages/profile.tsx index f5eb7cf..000ed48 100644 --- a/frontend/src/pages/profile.tsx +++ b/frontend/src/pages/profile.tsx @@ -3,6 +3,7 @@ import { mdiUpload, } from '@mdi/js'; import Head from 'next/head'; +import Image from 'next/image'; import React, { ReactElement, useEffect, useState } from 'react'; import { ToastContainer, toast } from 'react-toastify'; import DatePicker from 'react-datepicker'; @@ -84,7 +85,13 @@ const EditUsers = () => { {currentUser?.avatar[0]?.publicUrl &&
- Avatar + Avatar
} void; + children: React.ReactNode; +}; + +type ResultsTabButtonProps = { + label: string; + iconPath: string; + count?: number | string; + isActive: boolean; + onClick: () => void; +}; + +type PageFilterChipProps = { + label: string; + count: number; + iconPath: string; + isActive: boolean; + onClick: () => void; +}; + +type DeliverySummaryCardProps = { + label: string; + value: string | number; + helper: string; + iconPath: string; + toneClassName?: string; +}; + +type DeliveryActionCardProps = { + title: string; + description: string; + iconPath: string; + badge?: React.ReactNode; + children: React.ReactNode; +}; + const PLATFORM_OPTIONS = [ { value: 'wordpress', label: 'WordPress' }, { value: 'shopify', label: 'Shopify' }, @@ -132,10 +189,267 @@ const PLATFORM_OPTIONS = [ const initialReport: ReportResponse | null = null; +const parseTargetLines = (value: string) => value + .split(/\r?\n/) + .map((entry) => entry.trim()) + .filter(Boolean); + +const recommendationPriorityOrder = ['critical', 'high', 'medium', 'low', 'other'] as const; +type RecommendationPriorityId = (typeof recommendationPriorityOrder)[number]; + +type RecommendationPriorityMeta = { + id: RecommendationPriorityId; + label: string; + sortOrder: number; + iconPath: string; + badgeClassName: string; + sectionTitle: string; + sectionDescription: string; + accentClassName: string; +}; + +const recommendationPriorityMetaMap: Record = { + critical: { + id: 'critical', + label: 'Critical', + sortOrder: 0, + iconPath: icon.mdiAlertCircleOutline, + badgeClassName: 'bg-rose-600 text-white dark:bg-rose-500 dark:text-white', + sectionTitle: 'Critical fixes first', + sectionDescription: 'Resolve these before anything else because they are the most urgent structured data gaps.', + accentClassName: 'border-rose-200 bg-rose-50/80 dark:border-rose-500/30 dark:bg-rose-500/10', + }, + high: { + id: 'high', + label: 'High', + sortOrder: 1, + iconPath: icon.mdiAlertOutline, + badgeClassName: 'bg-amber-500 text-white dark:bg-amber-400 dark:text-slate-950', + sectionTitle: 'High priority', + sectionDescription: 'These recommendations should be tackled early because they likely affect key pages or important schema coverage.', + accentClassName: 'border-amber-200 bg-amber-50/80 dark:border-amber-500/30 dark:bg-amber-500/10', + }, + medium: { + id: 'medium', + label: 'Medium', + sortOrder: 2, + iconPath: icon.mdiArrowDownCircleOutline, + badgeClassName: 'bg-sky-600 text-white dark:bg-sky-500 dark:text-white', + sectionTitle: 'Next up', + sectionDescription: 'Address these after the urgent items to improve broader coverage and quality.', + accentClassName: 'border-sky-200 bg-sky-50/80 dark:border-sky-500/30 dark:bg-sky-500/10', + }, + low: { + id: 'low', + label: 'Low', + sortOrder: 3, + iconPath: icon.mdiCheckCircleOutline, + badgeClassName: 'bg-emerald-600 text-white dark:bg-emerald-500 dark:text-white', + sectionTitle: 'Quick wins', + sectionDescription: 'Useful polish items that can be handled once higher-impact fixes are moving.', + accentClassName: 'border-emerald-200 bg-emerald-50/80 dark:border-emerald-500/30 dark:bg-emerald-500/10', + }, + other: { + id: 'other', + label: 'Unprioritized', + sortOrder: 4, + iconPath: icon.mdiLightbulbOutline, + badgeClassName: 'bg-slate-800 text-white dark:bg-slate-200 dark:text-slate-950', + sectionTitle: 'More opportunities', + sectionDescription: 'These are useful follow-up ideas that were not assigned a stronger priority label.', + accentClassName: 'border-slate-200 bg-slate-50/80 dark:border-slate-700 dark:bg-slate-900/40', + }, +}; + +const normalizeRecommendationPriority = (priority?: string): RecommendationPriorityId => { + const normalizedPriority = priority?.trim().toLowerCase(); + + if (!normalizedPriority) { + return 'other'; + } + + if (normalizedPriority.includes('critical') || normalizedPriority === 'p0') { + return 'critical'; + } + + if (normalizedPriority.includes('high') || normalizedPriority === 'p1') { + return 'high'; + } + + if (normalizedPriority.includes('medium') || normalizedPriority.includes('med') || normalizedPriority === 'p2') { + return 'medium'; + } + + if (normalizedPriority.includes('low') || normalizedPriority === 'p3') { + return 'low'; + } + + return 'other'; +}; + +const getRecommendationPriorityMeta = (priority?: string) => recommendationPriorityMetaMap[normalizeRecommendationPriority(priority)]; + +const isFixFirstRecommendation = (recommendation: Recommendation) => { + const priorityId = normalizeRecommendationPriority(recommendation.priority); + return priorityId === 'critical' || priorityId === 'high'; +}; + +const getRecommendationScopeSortOrder = (pageScope?: string) => { + const normalizedScope = pageScope?.trim().toLowerCase() || ''; + + if (!normalizedScope) { + return 4; + } + + if (normalizedScope.includes('site') || normalizedScope.includes('global') || normalizedScope.includes('all')) { + return 0; + } + + if (normalizedScope.includes('home')) { + return 1; + } + + if (normalizedScope.includes('template') || normalizedScope.includes('category') || normalizedScope.includes('collection') || normalizedScope.includes('product')) { + return 2; + } + + if (normalizedScope.includes('page')) { + return 3; + } + + return 4; +}; + +const SetupAccordionSection = ({ + title, + description, + iconPath, + badge, + isOpen, + onToggle, + children, +}: SetupAccordionSectionProps) => ( +
+ + + {isOpen && ( +
+ {children} +
+ )} +
+); + +const ResultsTabButton = ({ label, iconPath, count, isActive, onClick }: ResultsTabButtonProps) => ( + +); + +const PageFilterChip = ({ label, count, iconPath, isActive, onClick }: PageFilterChipProps) => ( + +); + +const DeliverySummaryCard = ({ + label, + value, + helper, + iconPath, + toneClassName = 'border-slate-200 bg-white dark:border-slate-700 dark:bg-slate-950/40', +}: DeliverySummaryCardProps) => ( +
+
+
+
{label}
+
{value}
+
+ + + +
+

{helper}

+
+); + +const DeliveryActionCard = ({ title, description, iconPath, badge, children }: DeliveryActionCardProps) => ( +
+
+
+ + + +
+
{title}
+

{description}

+
+
+ {badge} +
+
{children}
+
+); + const SchemaAnalyzerPage = () => { const { currentUser } = useAppSelector((state) => state.auth); const [url, setUrl] = React.useState(''); const [requestedPages, setRequestedPages] = React.useState(1); + const [includeTargets, setIncludeTargets] = React.useState(''); + const [excludeTargets, setExcludeTargets] = React.useState(''); const [selectedPlatform, setSelectedPlatform] = React.useState('wordpress'); const [emailTo, setEmailTo] = React.useState(currentUser?.email || ''); const [report, setReport] = React.useState(initialReport); @@ -144,6 +458,21 @@ const SchemaAnalyzerPage = () => { const [emailingId, setEmailingId] = React.useState(null); const [exportingId, setExportingId] = React.useState(null); const [isCheckingPlatformOutput, setIsCheckingPlatformOutput] = React.useState(false); + const [openSections, setOpenSections] = React.useState>({ + targeting: false, + options: false, + limits: false, + }); + const [activeResultsTab, setActiveResultsTab] = React.useState('overview'); + const [activePageFilter, setActivePageFilter] = React.useState('all'); + const [activeRecommendationFilter, setActiveRecommendationFilter] = React.useState('all'); + const [isFailedPagesExpanded, setIsFailedPagesExpanded] = React.useState(false); + const [expandedRecommendationIds, setExpandedRecommendationIds] = React.useState>({}); + const resultsRef = React.useRef(null); + + const scrollToResults = React.useCallback(() => { + resultsRef.current?.scrollIntoView({ behavior: 'smooth', block: 'start' }); + }, []); React.useEffect(() => { if (currentUser?.email) { @@ -151,6 +480,24 @@ const SchemaAnalyzerPage = () => { } }, [currentUser?.email]); + React.useEffect(() => { + if (!report?.analysis) { + return undefined; + } + + setActiveResultsTab('overview'); + setActivePageFilter('all'); + setActiveRecommendationFilter('all'); + setIsFailedPagesExpanded(false); + setExpandedRecommendationIds({}); + + const timeoutId = window.setTimeout(() => { + scrollToResults(); + }, 150); + + return () => window.clearTimeout(timeoutId); + }, [report?.analysis?.analyzedUrl, report?.analysis?.fetchedAt, scrollToResults]); + const notify = React.useCallback((type: 'success' | 'error' | 'info', message: string) => { toast(message, { type, position: 'bottom-center' }); }, []); @@ -160,18 +507,289 @@ const SchemaAnalyzerPage = () => { [currentUser], ); const entitlements = report?.entitlements || report?.analysis?.entitlements || fallbackEntitlements; - const maxPagesPerCrawl = entitlements?.maxPagesPerCrawl || 1; + const maxPagesPerCrawl = entitlements?.maxPagesPerCrawl || fallbackEntitlements.maxPagesPerCrawl || 25; const recommendations = report?.recommendations || []; const exportableRecommendations = recommendations.filter( (recommendation) => recommendation.suggested_schema, ); + const sortedRecommendations = React.useMemo(() => ( + [...recommendations].sort((leftRecommendation, rightRecommendation) => { + const leftPriority = getRecommendationPriorityMeta(leftRecommendation.priority); + const rightPriority = getRecommendationPriorityMeta(rightRecommendation.priority); + + if (leftPriority.sortOrder !== rightPriority.sortOrder) { + return leftPriority.sortOrder - rightPriority.sortOrder; + } + + const leftScopeOrder = getRecommendationScopeSortOrder(leftRecommendation.page_scope); + const rightScopeOrder = getRecommendationScopeSortOrder(rightRecommendation.page_scope); + if (leftScopeOrder !== rightScopeOrder) { + return leftScopeOrder - rightScopeOrder; + } + + const leftHasCode = Number(Boolean(leftRecommendation.suggested_schema)); + const rightHasCode = Number(Boolean(rightRecommendation.suggested_schema)); + if (leftHasCode !== rightHasCode) { + return rightHasCode - leftHasCode; + } + + return leftRecommendation.title.localeCompare(rightRecommendation.title); + }) + ), [recommendations]); + const crawlPlan = report?.analysis?.crawlPlan; const isRequestedPagesOverLimit = requestedPages > maxPagesPerCrawl; - const firecrawlStatus = report?.analysis?.firecrawl || { - provider: 'firecrawl', - configured: false, - wouldHandleJavascript: true, - wouldHandleSitemapDiscovery: true, - message: 'Firecrawl scaffold is wired in code, but this environment still needs a FIRECRAWL_API_KEY before activation.', + const draftIncludeTargets = React.useMemo(() => parseTargetLines(includeTargets), [includeTargets]); + const draftExcludeTargets = React.useMemo(() => parseTargetLines(excludeTargets), [excludeTargets]); + const appliedIncludeTargets = crawlPlan?.includeTargets || draftIncludeTargets; + const appliedExcludeTargets = crawlPlan?.excludeTargets || draftExcludeTargets; + const analyzedPages = report?.analysis?.pages || []; + const failedPages = report?.analysis?.failedPages || []; + const jsonLdTypes = report?.analysis?.schema?.jsonLd?.types || []; + const invalidJsonLdBlocks = report?.analysis?.schema?.jsonLd?.invalidBlocks || []; + const hasTargetingRules = appliedIncludeTargets.length > 0 || appliedExcludeTargets.length > 0; + const selectedPlatformLabel = PLATFORM_OPTIONS.find( + (platformOption) => platformOption.value === selectedPlatform, + )?.label || 'Custom / Other'; + const analyzedTimestamp = report?.analysis?.fetchedAt + ? new Date(report.analysis.fetchedAt).toLocaleString() + : null; + const hasEmailRecipient = emailTo.trim().length > 0; + const hasUrl = url.trim().length > 0; + const targetingSummary = hasTargetingRules + ? `${appliedIncludeTargets.length} include · ${appliedExcludeTargets.length} exclude` + : 'No targeting rules'; + const recommendationQuickFilters = [ + { + id: 'all' as const, + label: 'All', + count: recommendations.length, + iconPath: icon.mdiViewListOutline, + }, + { + id: 'fixFirst' as const, + label: 'Fix first', + count: sortedRecommendations.filter((recommendation) => isFixFirstRecommendation(recommendation)).length, + iconPath: icon.mdiAlertCircleOutline, + }, + { + id: 'codeReady' as const, + label: 'Code ready', + count: sortedRecommendations.filter((recommendation) => recommendation.suggested_schema).length, + iconPath: icon.mdiCodeBraces, + }, + { + id: 'needsCode' as const, + label: 'Needs code', + count: sortedRecommendations.filter((recommendation) => !recommendation.suggested_schema).length, + iconPath: icon.mdiLightbulbOutline, + }, + ]; + const pageFilterOptions = [ + { + id: 'all' as const, + label: 'All', + count: analyzedPages.length + failedPages.length, + iconPath: icon.mdiViewListOutline, + }, + { + id: 'withSchema' as const, + label: 'With schema', + count: analyzedPages.filter((page) => page.hasStructuredData).length, + iconPath: icon.mdiCheckCircleOutline, + }, + { + id: 'missingSchema' as const, + label: 'Missing schema', + count: analyzedPages.filter((page) => !page.hasStructuredData).length, + iconPath: icon.mdiAlertCircleOutline, + }, + { + id: 'failed' as const, + label: 'Failed', + count: failedPages.length, + iconPath: icon.mdiCloseCircleOutline, + }, + ]; + const filteredRecommendations = React.useMemo(() => { + if (activeRecommendationFilter === 'fixFirst') { + return sortedRecommendations.filter((recommendation) => isFixFirstRecommendation(recommendation)); + } + + if (activeRecommendationFilter === 'codeReady') { + return sortedRecommendations.filter((recommendation) => recommendation.suggested_schema); + } + + if (activeRecommendationFilter === 'needsCode') { + return sortedRecommendations.filter((recommendation) => !recommendation.suggested_schema); + } + + return sortedRecommendations; + }, [activeRecommendationFilter, sortedRecommendations]); + const recommendationGroups = React.useMemo(() => recommendationPriorityOrder + .map((priorityId) => ({ + meta: recommendationPriorityMetaMap[priorityId], + recommendations: filteredRecommendations.filter( + (recommendation) => normalizeRecommendationPriority(recommendation.priority) === priorityId, + ), + })) + .filter((group) => group.recommendations.length > 0), [filteredRecommendations]); + const activeRecommendationFilterLabel = recommendationQuickFilters.find( + (filterOption) => filterOption.id === activeRecommendationFilter, + )?.label || 'All'; + const recommendationEmptyStateMessage = activeRecommendationFilter === 'fixFirst' + ? 'No high-priority recommendations are waiting in this report.' + : activeRecommendationFilter === 'codeReady' + ? 'No recommendations with generated code are available yet.' + : activeRecommendationFilter === 'needsCode' + ? 'Every visible recommendation already has a code snippet attached.' + : 'No recommendations were generated for this page yet.'; + + const filteredAnalyzedPages = React.useMemo(() => { + if (activePageFilter === 'withSchema') { + return analyzedPages.filter((page) => page.hasStructuredData); + } + + if (activePageFilter === 'missingSchema') { + return analyzedPages.filter((page) => !page.hasStructuredData); + } + + if (activePageFilter === 'failed') { + return []; + } + + return analyzedPages; + }, [activePageFilter, analyzedPages]); + const shouldShowFailedSection = failedPages.length > 0 && (activePageFilter === 'all' || activePageFilter === 'failed'); + const emptyPagesStateMessage = activePageFilter === 'failed' + ? 'No failed internal pages were recorded for this analysis run.' + : activePageFilter === 'withSchema' + ? 'No analyzed pages with structured data match this filter yet.' + : activePageFilter === 'missingSchema' + ? 'No analyzed pages are missing structured data for this run.' + : 'No page-level results are available yet for this analysis run.'; + const deliverySummaryCards = [ + { + label: 'Code-ready fixes', + value: exportableRecommendations.length, + helper: exportableRecommendations.length > 0 + ? `${exportableRecommendations.length} recommendation${exportableRecommendations.length === 1 ? '' : 's'} can be exported right now.` + : 'No code-ready recommendations yet. Use the Recommendations tab to refine the handoff.', + iconPath: icon.mdiCodeBraces, + toneClassName: exportableRecommendations.length > 0 + ? 'border-emerald-200 bg-emerald-50/80 dark:border-emerald-500/30 dark:bg-emerald-500/10' + : 'border-slate-200 bg-white dark:border-slate-700 dark:bg-slate-950/40', + }, + { + label: 'Email recipient', + value: hasEmailRecipient ? 'Ready' : 'Missing', + helper: hasEmailRecipient ? emailTo.trim() : 'Add a developer email before sending the handoff.', + iconPath: hasEmailRecipient ? icon.mdiEmailOutline : icon.mdiAlertCircleOutline, + toneClassName: hasEmailRecipient + ? 'border-sky-200 bg-sky-50/80 dark:border-sky-500/30 dark:bg-sky-500/10' + : 'border-amber-200 bg-amber-50/80 dark:border-amber-500/30 dark:bg-amber-500/10', + }, + { + label: 'Platform output', + value: entitlements?.canPlatformOutput ? 'Unlocked' : 'Premium', + helper: entitlements?.canPlatformOutput + ? `${selectedPlatformLabel} output can be checked in Step 4.` + : 'Premium is required for Step 4 platform-specific output.', + iconPath: entitlements?.canPlatformOutput ? icon.mdiCheckCircleOutline : icon.mdiLockOutline, + toneClassName: entitlements?.canPlatformOutput + ? 'border-violet-200 bg-violet-50/80 dark:border-violet-500/30 dark:bg-violet-500/10' + : 'border-slate-200 bg-white dark:border-slate-700 dark:bg-slate-950/40', + }, + ]; + const deliveryChecklist = [ + { + id: 'recipient', + label: 'Recipient email', + value: hasEmailRecipient ? emailTo.trim() : 'Add an email to send the handoff.', + isReady: hasEmailRecipient, + }, + { + id: 'export', + label: 'Export package', + value: exportableRecommendations.length > 0 + ? `${exportableRecommendations.length} code-ready recommendation${exportableRecommendations.length === 1 ? '' : 's'} available.` + : 'Export all still works, but no generated code is attached yet.', + isReady: Boolean(report?.site?.id), + }, + { + id: 'platform', + label: 'Step 4 output', + value: entitlements?.canPlatformOutput + ? `${selectedPlatformLabel} output is available for this workspace.` + : `${selectedPlatformLabel} output requires Premium access.`, + isReady: Boolean(entitlements?.canPlatformOutput), + }, + ]; + const overviewStats = [ + { + label: 'Pages analyzed', + value: crawlPlan?.actualPagesAnalyzed || analyzedPages.length || 0, + helper: 'Crawl total', + iconPath: icon.mdiFileDocumentOutline, + }, + { + label: 'Recommendations', + value: recommendations.length, + helper: 'Next actions', + iconPath: icon.mdiLightbulbOutline, + }, + { + label: 'Structured data', + value: report?.analysis?.crawlSummary?.pagesWithStructuredData ?? (report?.analysis?.schema?.hasStructuredData ? 1 : 0), + helper: 'Pages with schema', + iconPath: icon.mdiCheckCircleOutline, + }, + { + label: 'JSON-LD blocks', + value: report?.analysis?.schema?.jsonLd?.count || 0, + helper: 'Detected snippets', + iconPath: icon.mdiCodeJson, + }, + { + label: 'Failed fetches', + value: report?.analysis?.crawlSummary?.failedPages ?? failedPages.length, + helper: 'Needs follow-up', + iconPath: icon.mdiAlertCircleOutline, + }, + { + label: 'Invalid blocks', + value: invalidJsonLdBlocks.length, + helper: 'Needs cleanup', + iconPath: icon.mdiAlertOutline, + }, + ]; + + React.useEffect(() => { + if ((draftIncludeTargets.length > 0 || draftExcludeTargets.length > 0) && !openSections.targeting) { + setOpenSections((currentSections) => ({ + ...currentSections, + targeting: true, + })); + } + }, [draftExcludeTargets.length, draftIncludeTargets.length, openSections.targeting]); + + React.useEffect(() => { + if (activePageFilter === 'failed' && failedPages.length > 0) { + setIsFailedPagesExpanded(true); + } + }, [activePageFilter, failedPages.length]); + + const toggleSection = (section: SetupSectionId) => { + setOpenSections((currentSections) => ({ + ...currentSections, + [section]: !currentSections[section], + })); + }; + + const toggleRecommendationCode = (recommendationId: string) => { + setExpandedRecommendationIds((currentIds) => ({ + ...currentIds, + [recommendationId]: !currentIds[recommendationId], + })); }; const handleAnalyze = async () => { @@ -183,7 +801,7 @@ const SchemaAnalyzerPage = () => { if (isRequestedPagesOverLimit) { notify( 'error', - `Your current plan allows up to ${maxPagesPerCrawl} page${maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Upgrade to Advanced Crawl to go beyond that limit.`, + `This analyzer supports up to ${maxPagesPerCrawl} page${maxPagesPerCrawl === 1 ? '' : 's'} per crawl. Reduce the page count to continue.`, ); return; } @@ -193,6 +811,8 @@ const SchemaAnalyzerPage = () => { const response = await axios.post('/sites/analyze', { url: url.trim(), requestedPages, + includeTargets, + excludeTargets, }); setReport(response.data); @@ -352,8 +972,6 @@ const SchemaAnalyzerPage = () => { } }; - const crawlPlan = report?.analysis?.crawlPlan; - return ( <> @@ -368,39 +986,39 @@ const SchemaAnalyzerPage = () => { {''} - -
+ +

Analyze a customer site

-

- Enter a domain or full URL. The app will detect the platform, crawl up to your allowed page limit, - inspect structured data across the discovered pages, generate rules-based schema recommendations, - and prepare developer-ready code snippets. +

+ Enter a domain or full URL, choose how many pages to review, and optionally focus the report on the + folders, categories, or pages that matter most. This setup keeps the page cleaner on mobile while still + supporting up to {maxPagesPerCrawl} pages per crawl.

+
-
- - setUrl(event.target.value)} - onKeyDown={(event) => { - if (event.key === 'Enter') { - event.preventDefault(); - handleAnalyze().catch(() => null); - } - }} - /> - -
+
+ + setUrl(event.target.value)} + onKeyDown={(event) => { + if (event.key === 'Enter') { + event.preventDefault(); + handleAnalyze().catch(() => null); + } + }} + /> + -
+
{ /> - - - -
- - {isRequestedPagesOverLimit && ( -
- You requested {requestedPages} pages, but this account is capped at {maxPagesPerCrawl}. Upgrade to - Advanced Crawl to raise that limit. +
+
Quick setup
+
Pick a page count here, then use Target pages below if you want a more focused report.
- )} - - - { - handleAnalyze().catch(() => null); - }} - /> - { - const combined = exportableRecommendations - .map((recommendation) => recommendation.suggested_schema) - .filter(Boolean) - .join('\n\n'); - navigator.clipboard - .writeText(combined) - .then(() => notify('success', 'All schema code copied to clipboard.')) - .catch((error) => { - console.error('Copy all code failed:', error); - notify('error', 'Unable to copy the combined code.'); - }); - }} - /> - { - handlePlatformOutputCheck().catch(() => null); - }} - /> - +
-
-
-
-
-

Paywall status

-

- Advanced Crawl is now enforced and active. Premium still reserves Step 4 platform output. Firecrawl is scaffolded for sitemap + JS-rendered crawling, but not activated yet. -

-
- - {entitlements?.canPlatformOutput ? 'Premium access' : 'Basic access'} - -
- -
-
- Advanced crawl entitlement - - {entitlements?.canAdvancedCrawl ? 'Unlocked' : 'Locked'} - -
-
- Max pages per crawl - {maxPagesPerCrawl} -
-
- Platform-specific Step 4 output - - {entitlements?.canPlatformOutput ? 'Reserved' : 'Premium only'} - -
-
+ {isRequestedPagesOverLimit && ( +
+ You requested {requestedPages} pages, but this analyzer is capped at {maxPagesPerCrawl}. Reduce the page count to continue.
+ )} -
-
-
-

Firecrawl scaffold

-

- Sitemap discovery and JS-rendered crawl are planned through Firecrawl. This environment is currently using the built-in crawler only. -

-
- - {firecrawlStatus?.configured ? 'Key detected' : 'API key needed'} - -
+
+ toggleSection('targeting')} + > +
+ +