diff --git a/backend/src/services/sites.js b/backend/src/services/sites.js index e1e4b55..c771f34 100644 --- a/backend/src/services/sites.js +++ b/backend/src/services/sites.js @@ -13,6 +13,102 @@ const { getFirecrawlScaffold, crawlSiteWithFirecrawl } = require('./firecrawl'); const REQUEST_TIMEOUT = 15000; const PREVIEW_LIMIT = 5; const NON_HTML_FILE_PATTERN = /\.(?:7z|avi|bmp|css|csv|docx?|eot|gif|ico|jpe?g|js|json|map|mov|mp3|mp4|pdf|png|pptx?|rar|svg|tar|tgz|txt|wav|webm|webp|woff2?|xlsx?|xml|zip)$/i; +const WORDPRESS_PLUGIN_DEFINITIONS = [ + { + key: 'yoast', + label: 'Yoast SEO', + category: 'seo', + priority: 100, + patterns: [ + 'wp-content/plugins/wordpress-seo/', + 'class="yoast', + 'id="yoast', + 'yoast-schema-graph', + 'yoast_head', + ], + }, + { + key: 'rankmath', + label: 'Rank Math', + category: 'seo', + priority: 95, + patterns: [ + 'wp-content/plugins/seo-by-rank-math/', + 'rank-math', + 'rank_math', + ], + }, + { + key: 'aioseo', + label: 'All in One SEO', + category: 'seo', + priority: 90, + patterns: [ + 'wp-content/plugins/all-in-one-seo-pack/', + 'wp-content/plugins/aioseo/', + 'aioseo', + ], + }, + { + key: 'seopress', + label: 'SEOPress', + category: 'seo', + priority: 85, + patterns: [ + 'wp-content/plugins/wp-seopress/', + 'seopress', + ], + }, + { + key: 'woocommerce', + label: 'WooCommerce', + category: 'commerce', + priority: 80, + patterns: [ + 'wp-content/plugins/woocommerce/', + 'woocommerce', + 'wc-block-components', + 'single-product', + 'product-type-', + 'add_to_cart', + ], + }, + { + key: 'elementor', + label: 'Elementor', + category: 'builder', + priority: 70, + patterns: [ + 'wp-content/plugins/elementor/', + 'elementor-', + 'data-elementor', + ], + }, + { + key: 'wpbakery', + label: 'WPBakery', + category: 'builder', + priority: 65, + patterns: [ + 'wp-content/plugins/js_composer/', + 'wpb_js_composer', + 'vc_row', + 'wpb-content-wrapper', + ], + }, + { + key: 'acf', + label: 'Advanced Custom Fields', + category: 'fields', + priority: 60, + patterns: [ + 'wp-content/plugins/advanced-custom-fields/', + 'wp-content/plugins/acf/', + 'acf-field', + 'acf/', + ], + }, +]; function normalizeUrl(rawUrl) { if (!rawUrl || typeof rawUrl !== 'string') { @@ -86,15 +182,72 @@ function addJsonLdTypes(node, types) { normalizedTypes .filter(Boolean) - .forEach((type) => types.add(String(type))); + .forEach((type) => addSchemaTypeValue(type, types)); } Object.values(node).forEach((value) => addJsonLdTypes(value, types)); } +function normalizeSchemaTypeLabel(value) { + const trimmedValue = String(value || '').trim(); + + if (!trimmedValue) { + return null; + } + + const withoutAngleBrackets = trimmedValue.replace(/^<|>$/g, ''); + const withoutSchemaPrefix = withoutAngleBrackets + .replace(/^https?:\/\/(?:www\.)?schema\.org\//i, '') + .replace(/^schema:/i, '') + .replace(/^https?:\/\//i, ''); + const withoutFragment = withoutSchemaPrefix.split('#').pop() || withoutSchemaPrefix; + const withoutQuery = withoutFragment.split('?')[0] || withoutFragment; + const normalized = withoutQuery + .split('/') + .filter(Boolean) + .pop(); + + return normalized ? normalized.trim() : null; +} + +function extractAttributeValues(html, attributeName) { + const matches = [ + ...String(html || '').matchAll( + new RegExp("\\b" + attributeName + "\\s*=\\s*(?:\"([^\"]+)\"|'([^']+)'|([^\\s>]+))", 'gi'), + ), + ]; + + return matches + .map((match) => match[1] || match[2] || match[3] || '') + .map((value) => value.trim()) + .filter(Boolean); +} + +function addSchemaTypeValue(value, types) { + if (!value) { + return; + } + + const typeCandidates = Array.isArray(value) + ? value + : String(value) + .split(/\s+/) + .map((entry) => entry.trim()) + .filter(Boolean); + + typeCandidates.forEach((candidate) => { + const normalizedCandidate = normalizeSchemaTypeLabel(candidate); + + if (normalizedCandidate) { + types.add(normalizedCandidate); + } + }); +} + function extractSchemaSummary(html) { + const resolvedHtml = String(html || ''); const jsonLdMatches = [ - ...html.matchAll( + ...resolvedHtml.matchAll( /]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi, ), ]; @@ -120,19 +273,36 @@ function extractSchemaSummary(html) { } }); - const microdataCount = (html.match(/\sitemscope(?:\s|=|>)/gi) || []).length; - const rdfaTypeofCount = (html.match(/\stypeof\s*=\s*(["']).*?\1/gi) || []).length; - const rdfaPropertyCount = (html.match(/\sproperty\s*=\s*(["']).*?\1/gi) || []).length; - const rdfaVocabCount = (html.match(/\svocab\s*=\s*(["']).*?\1/gi) || []).length; + const microdataTypes = new Set(); + const rdfaTypes = new Set(); + const microdataCount = (resolvedHtml.match(/\sitemscope(?:\s|=|>)/gi) || []).length; + const rdfaTypeofCount = (resolvedHtml.match(/\stypeof\s*=\s*(?:"[^"]+"|'[^']+'|[^\s>]+)/gi) || []).length; + const rdfaPropertyCount = (resolvedHtml.match(/\sproperty\s*=\s*(?:"[^"]+"|'[^']+'|[^\s>]+)/gi) || []).length; + const rdfaVocabCount = (resolvedHtml.match(/\svocab\s*=\s*(?:"[^"]+"|'[^']+'|[^\s>]+)/gi) || []).length; const rdfaCount = Math.max( rdfaTypeofCount, rdfaPropertyCount, rdfaVocabCount, ); + extractAttributeValues(resolvedHtml, 'itemtype').forEach((itemType) => { + addSchemaTypeValue(itemType, microdataTypes); + }); + + extractAttributeValues(resolvedHtml, 'typeof').forEach((typeOfValue) => { + addSchemaTypeValue(typeOfValue, rdfaTypes); + }); + + const detectedTypes = new Set([ + ...Array.from(jsonLdTypes), + ...Array.from(microdataTypes), + ...Array.from(rdfaTypes), + ]); + return { hasStructuredData: jsonLdMatches.length > 0 || microdataCount > 0 || rdfaCount > 0, + types: Array.from(detectedTypes), jsonLd: { count: jsonLdMatches.length, types: Array.from(jsonLdTypes), @@ -141,10 +311,12 @@ function extractSchemaSummary(html) { microdata: { count: microdataCount, detected: microdataCount > 0, + types: Array.from(microdataTypes), }, rdfa: { count: rdfaCount, detected: rdfaCount > 0, + types: Array.from(rdfaTypes), }, }; } @@ -237,6 +409,339 @@ function detectPlatform(html, headers, analyzedUrl) { }; } +function getConfidenceLabel(score) { + if (score >= 3) { + return 'high'; + } + + if (score >= 2) { + return 'medium'; + } + + return 'low'; +} + +function getConfidenceRank(confidence) { + if (confidence === 'high') { + return 3; + } + + if (confidence === 'medium') { + return 2; + } + + return 1; +} + +function getWordPressPluginPriority(pluginKey) { + const pluginDefinition = WORDPRESS_PLUGIN_DEFINITIONS.find((definition) => definition.key === pluginKey); + + return pluginDefinition?.priority || 0; +} + +function getWordPressTargetSchemaTypes(schemaTypes = [], pageSignals = {}) { + const targetTypes = new Set(); + + if (hasSchemaType(schemaTypes, ['LocalBusiness']) || pageSignals.hasLocalBusinessHints) { + targetTypes.add('LocalBusiness'); + } else { + targetTypes.add('Organization'); + } + + targetTypes.add('WebSite'); + + if (hasSchemaType(schemaTypes, ['Product', 'Offer', 'AggregateRating', 'Review']) || pageSignals.hasProductHints) { + targetTypes.add('Product'); + targetTypes.add('Offer'); + } + + if (hasSchemaType(schemaTypes, ['Article', 'BlogPosting', 'NewsArticle']) || pageSignals.hasBlogHints) { + targetTypes.add('BlogPosting'); + } + + if (hasSchemaType(schemaTypes, ['FAQPage']) || pageSignals.hasFaqHints) { + targetTypes.add('FAQPage'); + } + + if (hasSchemaType(schemaTypes, ['Service']) || pageSignals.hasServiceHints) { + targetTypes.add('Service'); + } + + return Array.from(targetTypes); +} + +function buildWordPressPluginRecommendations(plugins, schemaTypes = [], pageSignals = {}) { + const pluginList = Array.isArray(plugins) ? plugins : []; + const pluginMap = new Map(pluginList.map((plugin) => [plugin.key, plugin])); + const targetTypes = getWordPressTargetSchemaTypes(schemaTypes, pageSignals); + const sitewideTypes = targetTypes.filter((type) => ['Organization', 'LocalBusiness', 'WebSite', 'BlogPosting', 'FAQPage', 'Service'].includes(type)); + const productTypes = targetTypes.filter((type) => ['Product', 'Offer'].includes(type)); + const recommendations = []; + const seoPlugins = pluginList.filter((plugin) => plugin.category === 'seo'); + const primarySeoPlugin = seoPlugins + .slice() + .sort((left, right) => getWordPressPluginPriority(right.key) - getWordPressPluginPriority(left.key))[0]; + + if (seoPlugins.length > 1) { + recommendations.push({ + key: 'seo-consolidation', + label: 'SEO plugin consolidation', + category: 'seo', + priority: 'high', + title: 'Choose a single SEO/schema owner before changing markup', + summary: `Detected SEO plugins (${seoPlugins.map((plugin) => plugin.label).join(', ')}) can overlap on Organization, WebSite, Article, and FAQ output.`, + recommendedApproach: 'Disable overlapping schema modules or remove duplicate output before implementing new JSON-LD in theme files.', + applicableSchemaTypes: sitewideTypes.length > 0 ? sitewideTypes : ['Organization', 'WebSite'], + }); + } else if (primarySeoPlugin) { + recommendations.push({ + key: primarySeoPlugin.key, + label: primarySeoPlugin.label, + category: primarySeoPlugin.category, + priority: 'high', + title: `Use ${primarySeoPlugin.label} for sitewide schema changes first`, + summary: `${primarySeoPlugin.label} is the strongest candidate to own homepage and editorial schema before custom theme code is added.`, + recommendedApproach: `Implement ${sitewideTypes.join(', ') || 'sitewide'} changes through ${primarySeoPlugin.label} settings, schema modules, or documented filters before hardcoding JSON-LD.`, + applicableSchemaTypes: sitewideTypes.length > 0 ? sitewideTypes : ['Organization', 'WebSite'], + }); + } + + if (pluginMap.has('woocommerce')) { + recommendations.push({ + key: 'woocommerce', + label: 'WooCommerce', + category: 'commerce', + priority: productTypes.length > 0 ? 'high' : 'medium', + title: 'Handle product schema in WooCommerce templates or hooks', + summary: 'WooCommerce commonly outputs Product and Offer markup, so product-level schema changes should be audited in plugin templates before theme JSON-LD is added.', + recommendedApproach: productTypes.length > 0 + ? 'Review existing WooCommerce product schema first, then extend product/offer data with WooCommerce hooks or template overrides only where needed.' + : 'If product pages are added later, keep WooCommerce as the first place to audit before adding custom Product schema snippets.', + applicableSchemaTypes: productTypes.length > 0 ? productTypes : ['Product', 'Offer'], + }); + } + + if (pluginMap.has('acf')) { + recommendations.push({ + key: 'acf', + label: 'Advanced Custom Fields', + category: 'fields', + priority: 'medium', + title: 'Use ACF fields as the schema data source', + summary: 'ACF is a good place to store repeatable schema inputs so FAQ, Service, and editorial markup stay editable for content teams.', + recommendedApproach: 'Map ACF fields to JSON-LD generation in a theme helper or lightweight custom plugin instead of hardcoding page-specific values in templates.', + applicableSchemaTypes: targetTypes.filter((type) => ['FAQPage', 'Service', 'BlogPosting', 'LocalBusiness'].includes(type)), + }); + } + + const builderPlugins = pluginList.filter((plugin) => plugin.category === 'builder'); + if (builderPlugins.length > 0) { + recommendations.push({ + key: 'builder-templates', + label: builderPlugins.map((plugin) => plugin.label).join(' + '), + category: 'builder', + priority: 'medium', + title: 'Confirm builder templates before inserting page-level schema', + summary: `${builderPlugins.map((plugin) => plugin.label).join(' and ')} may control landing-page markup, so schema placement should be validated in the builder/template layer.`, + recommendedApproach: 'Check builder-controlled templates, theme hooks, and reusable sections before injecting FAQPage or Service markup into a single template file.', + applicableSchemaTypes: targetTypes.filter((type) => ['FAQPage', 'Service', 'BlogPosting'].includes(type)), + }); + } + + if (recommendations.length === 0) { + recommendations.push({ + key: 'theme-custom', + label: 'Theme / custom code', + category: 'custom', + priority: 'medium', + title: 'Implement schema in theme or custom plugin code', + summary: 'No major schema-owning WordPress plugin was detected, so the theme or a custom plugin is the most likely implementation layer.', + recommendedApproach: 'Use reusable helpers, theme hooks, or a lightweight custom plugin to generate schema from central data instead of duplicating snippets across templates.', + applicableSchemaTypes: targetTypes, + }); + } + + return recommendations + .map((recommendation) => ({ + ...recommendation, + applicableSchemaTypes: Array.from(new Set(recommendation.applicableSchemaTypes || [])).filter(Boolean), + })) + .slice(0, 4); +} + +function buildWordPressDuplicateRisk(plugins, schemaTypes = [], pageSignals = {}) { + const pluginList = Array.isArray(plugins) ? plugins : []; + const seoPlugins = pluginList.filter((plugin) => plugin.category === 'seo'); + const builderPlugins = pluginList.filter((plugin) => plugin.category === 'builder'); + const hasWooCommerce = pluginList.some((plugin) => plugin.key === 'woocommerce'); + const hasProductSchema = hasSchemaType(schemaTypes, ['Product', 'Offer', 'AggregateRating', 'Review']) || pageSignals.hasProductHints; + const warnings = []; + let level = 'low'; + + if (seoPlugins.length > 1) { + level = 'high'; + warnings.push(`Multiple SEO/schema plugins were detected (${seoPlugins.map((plugin) => plugin.label).join(', ')}), which often leads to duplicate Organization, WebSite, Article, or FAQ graphs.`); + } + + if (seoPlugins.length > 0 && hasWooCommerce && hasProductSchema) { + if (level !== 'high') { + level = 'medium'; + } + warnings.push('A sitewide SEO plugin and WooCommerce may both influence product-related schema, so Product, Offer, Review, or AggregateRating markup should be audited for overlap.'); + } + + if (builderPlugins.length > 0 && seoPlugins.length > 0 && (pageSignals.hasFaqHints || pageSignals.hasServiceHints || pageSignals.hasBlogHints)) { + if (level === 'low') { + level = 'medium'; + } + warnings.push('Builder-managed landing pages can reintroduce page-level FAQ, Service, or Article markup on top of SEO-plugin output if custom snippets were added previously.'); + } + + if (warnings.length === 0) { + return { + level: 'low', + label: 'Low duplicate-schema risk', + summary: 'The detected WordPress stack does not show strong duplicate-schema signals yet, but existing snippets should still be audited before new markup is added.', + warnings: [], + affectedPlugins: pluginList.map((plugin) => plugin.label), + }; + } + + return { + level, + label: `${level.charAt(0).toUpperCase()}${level.slice(1)} duplicate-schema risk`, + summary: level === 'high' + ? 'More than one schema-capable layer appears active, so duplicate graphs are likely unless ownership is consolidated first.' + : 'At least two WordPress layers may influence schema output, so existing markup should be audited before new snippets are shipped.', + warnings, + affectedPlugins: pluginList.map((plugin) => plugin.label), + }; +} + +function buildWordPressSchemaOwnership(plugins, schemaTypes = []) { + const pluginList = Array.isArray(plugins) ? plugins : []; + const seoPlugins = pluginList.filter((plugin) => plugin.category === 'seo'); + const hasWooCommerce = pluginList.some((plugin) => plugin.key === 'woocommerce'); + const hasProductSchema = hasSchemaType(schemaTypes, ['Product', 'Offer', 'AggregateRating', 'Review']); + const builderPlugins = pluginList.filter((plugin) => plugin.category === 'builder'); + const notes = []; + + if (seoPlugins.length > 1) { + notes.push('Audit the existing schema output before adding new markup so multiple SEO plugins do not duplicate the graph.'); + } + + if (hasWooCommerce) { + notes.push('Review WooCommerce product and offer markup before adding custom Product JSON-LD.'); + } + + if (builderPlugins.length > 0) { + notes.push('Page-builder templates may control where schema blocks should be inserted on landing pages and service pages.'); + } + + if (seoPlugins.length > 1) { + return { + mode: 'mixed', + label: 'Multiple schema-capable plugins detected', + summary: 'More than one SEO/schema plugin appears active. Consolidate schema ownership before adding new output to avoid duplicate graphs.', + recommendedImplementation: 'Choose one schema owner first, then extend that plugin or remove overlapping output before coding theme-level JSON-LD.', + notes, + }; + } + + if (seoPlugins.length === 1 && hasWooCommerce && hasProductSchema) { + return { + mode: 'mixed', + label: `${seoPlugins[0].label} + WooCommerce likely share schema ownership`, + summary: `${seoPlugins[0].label} likely controls sitewide schema while WooCommerce likely controls product-level markup. Extend those layers before hardcoding duplicate schema.`, + recommendedImplementation: `Use ${seoPlugins[0].label} for organization/sitewide schema and review WooCommerce product templates or hooks for product schema changes.`, + notes, + }; + } + + if (seoPlugins.length === 1) { + return { + mode: 'plugin_managed', + label: `${seoPlugins[0].label} likely manages primary schema output`, + summary: `${seoPlugins[0].label} appears to be the main schema layer for this WordPress site. Prefer plugin settings, custom fields, or documented filters before injecting theme JSON-LD.`, + recommendedImplementation: `Implement schema changes through ${seoPlugins[0].label} first, and only fall back to theme or custom code for unsupported schema types.`, + notes, + }; + } + + if (hasWooCommerce && hasProductSchema) { + return { + mode: 'plugin_managed', + label: 'WooCommerce likely manages product schema', + summary: 'WooCommerce signals and product schema were detected together, so product-level structured data may already be plugin-managed.', + recommendedImplementation: 'Audit existing WooCommerce product markup before adding custom Product or Offer schema in theme files.', + notes, + }; + } + + return { + mode: 'theme_or_custom', + label: 'Theme or custom code likely manages schema output', + summary: 'No major WordPress SEO/schema plugin was detected. Schema changes may need to be implemented in the theme, a custom plugin, or custom fields.', + recommendedImplementation: 'Plan for theme hooks, a lightweight custom plugin, or custom field-driven JSON-LD injection for the recommended schema types.', + notes, + }; +} + +function detectWordPressSignals(html, platform, schema, pageSignals = {}) { + const lowerHtml = String(html || '').toLowerCase(); + const hasWordPressSignals = + platform?.detected === 'wordpress' + || lowerHtml.includes('wp-content') + || lowerHtml.includes('wp-includes') + || lowerHtml.includes('wp-json') + || lowerHtml.includes('content="wordpress'); + + if (!hasWordPressSignals) { + return { + detected: false, + plugins: [], + schemaOwnership: null, + pluginRecommendations: [], + duplicateRisk: null, + }; + } + + const plugins = WORDPRESS_PLUGIN_DEFINITIONS + .map((definition) => { + const matchedSignals = definition.patterns.filter((pattern) => lowerHtml.includes(pattern)); + + if (matchedSignals.length === 0) { + return null; + } + + return { + key: definition.key, + label: definition.label, + category: definition.category, + confidence: getConfidenceLabel(matchedSignals.length), + evidence: matchedSignals.slice(0, 4), + implementationHint: buildWordPressSchemaOwnership([ + { + key: definition.key, + label: definition.label, + category: definition.category, + }, + ], schema?.types || []).recommendedImplementation, + priority: definition.priority, + }; + }) + .filter(Boolean) + .sort((left, right) => (right.priority || 0) - (left.priority || 0)); + + return { + detected: true, + plugins, + schemaOwnership: buildWordPressSchemaOwnership(plugins, schema?.types || []), + pluginRecommendations: buildWordPressPluginRecommendations(plugins, schema?.types || [], pageSignals), + duplicateRisk: buildWordPressDuplicateRisk(plugins, schema?.types || [], pageSignals), + }; +} + function isHtmlLikeResponse(response) { const contentType = String(response?.headers?.['content-type'] || '').toLowerCase(); @@ -530,6 +1035,7 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) { pageTitle, platform, ); + const wordpress = detectWordPressSignals(html, platform, schema, pageSignals); return { requestedUrl: pageUrl, @@ -539,6 +1045,7 @@ async function fetchAnalyzedPage(pageUrl, allowedHostnames) { html, platform, schema, + wordpress, pageSignals, discoveredLinks: extractInternalLinks( html, @@ -577,6 +1084,7 @@ function analyzeFetchedPage({ resolvedPageTitle, platform, ); + const wordpress = detectWordPressSignals(resolvedHtml, platform, schema, pageSignals); const normalizedLinks = Array.isArray(discoveredLinks) ? Array.from( new Set( @@ -599,6 +1107,7 @@ function analyzeFetchedPage({ html: resolvedHtml, platform, schema, + wordpress, pageSignals, discoveredLinks: normalizedLinks, }; @@ -781,8 +1290,100 @@ async function crawlPages(baseUrl, requestedPages, crawlTargets = {}) { }; } + +function buildAggregateWordPress(pageAnalyses, aggregateSchema, aggregateSignals = {}) { + const pluginMap = new Map(); + let detectedPageCount = 0; + + pageAnalyses.forEach((page) => { + const wordpress = page.wordpress || {}; + + if (!wordpress.detected) { + return; + } + + detectedPageCount += 1; + + (wordpress.plugins || []).forEach((plugin) => { + const existing = pluginMap.get(plugin.key); + const mergedEvidence = Array.from(new Set([ + ...(existing?.evidence || []), + ...(plugin.evidence || []), + ])).slice(0, 5); + const mergedPages = Array.from(new Set([ + ...(existing?.pageUrls || []), + page.analyzedUrl, + ])).filter(Boolean); + const nextPlugin = { + key: plugin.key, + label: plugin.label, + category: plugin.category, + confidence: plugin.confidence, + implementationHint: plugin.implementationHint, + evidence: mergedEvidence, + pageUrls: mergedPages, + }; + + if (!existing) { + pluginMap.set(plugin.key, nextPlugin); + return; + } + + if (getConfidenceRank(plugin.confidence) >= getConfidenceRank(existing.confidence)) { + pluginMap.set(plugin.key, nextPlugin); + return; + } + + pluginMap.set(plugin.key, { + ...existing, + evidence: mergedEvidence, + pageUrls: mergedPages, + }); + }); + }); + + const plugins = Array.from(pluginMap.values()) + .sort((left, right) => { + const confidenceDelta = getConfidenceRank(right.confidence) - getConfidenceRank(left.confidence); + + if (confidenceDelta !== 0) { + return confidenceDelta; + } + + return left.label.localeCompare(right.label); + }) + .map((plugin) => ({ + ...plugin, + pageCount: plugin.pageUrls.length, + pageUrls: plugin.pageUrls.slice(0, 5), + })); + + if (detectedPageCount === 0) { + return { + detected: false, + detectedPageCount: 0, + plugins: [], + schemaOwnership: null, + pluginRecommendations: [], + duplicateRisk: null, + }; + } + + return { + detected: true, + detectedPageCount, + plugins, + schemaOwnership: buildWordPressSchemaOwnership(plugins, aggregateSchema?.types || []), + pluginRecommendations: buildWordPressPluginRecommendations(plugins, aggregateSchema?.types || [], aggregateSignals), + duplicateRisk: buildWordPressDuplicateRisk(plugins, aggregateSchema?.types || [], aggregateSignals), + }; +} + function buildAggregateSchema(pageAnalyses) { + const allTypes = new Set(); const jsonLdTypes = new Set(); + const microdataTypes = new Set(); + const rdfaTypes = new Set(); const invalidBlocks = []; let jsonLdCount = 0; let microdataCount = 0; @@ -795,7 +1396,10 @@ function buildAggregateSchema(pageAnalyses) { microdataCount += schema.microdata?.count || 0; rdfaCount += schema.rdfa?.count || 0; + (schema.types || []).forEach((typeName) => allTypes.add(typeName)); (schema.jsonLd?.types || []).forEach((typeName) => jsonLdTypes.add(typeName)); + (schema.microdata?.types || []).forEach((typeName) => microdataTypes.add(typeName)); + (schema.rdfa?.types || []).forEach((typeName) => rdfaTypes.add(typeName)); (schema.jsonLd?.invalidBlocks || []).forEach((block) => { invalidBlocks.push({ ...block, @@ -806,6 +1410,7 @@ function buildAggregateSchema(pageAnalyses) { return { hasStructuredData: pageAnalyses.some((page) => page.schema?.hasStructuredData), + types: Array.from(allTypes), jsonLd: { count: jsonLdCount, types: Array.from(jsonLdTypes), @@ -814,10 +1419,12 @@ function buildAggregateSchema(pageAnalyses) { microdata: { count: microdataCount, detected: microdataCount > 0, + types: Array.from(microdataTypes), }, rdfa: { count: rdfaCount, detected: rdfaCount > 0, + types: Array.from(rdfaTypes), }, }; } @@ -832,21 +1439,25 @@ function buildAggregateSignals(pageAnalyses) { hasProductHints: accumulator.hasProductHints || Boolean(pageSignals.hasProductHints), hasLocalBusinessHints: accumulator.hasLocalBusinessHints || Boolean(pageSignals.hasLocalBusinessHints), + hasServiceHints: accumulator.hasServiceHints || Boolean(pageSignals.hasServiceHints), faqPages: accumulator.faqPages + (pageSignals.hasFaqHints ? 1 : 0), blogPages: accumulator.blogPages + (pageSignals.hasBlogHints ? 1 : 0), productPages: accumulator.productPages + (pageSignals.hasProductHints ? 1 : 0), localBusinessPages: accumulator.localBusinessPages + (pageSignals.hasLocalBusinessHints ? 1 : 0), + servicePages: accumulator.servicePages + (pageSignals.hasServiceHints ? 1 : 0), }; }, { hasFaqHints: false, hasBlogHints: false, hasProductHints: false, hasLocalBusinessHints: false, + hasServiceHints: false, faqPages: 0, blogPages: 0, productPages: 0, localBusinessPages: 0, + servicePages: 0, }); } @@ -894,6 +1505,7 @@ function buildAggregateAnalysis({ const finishedAt = new Date(); const aggregateSchema = buildAggregateSchema(pageAnalyses); const aggregateSignals = buildAggregateSignals(pageAnalyses); + const aggregateWordPress = buildAggregateWordPress(pageAnalyses, aggregateSchema, aggregateSignals); const crawlTargetSummary = summarizeCrawlTargets(crawlTargets); return { @@ -908,6 +1520,7 @@ function buildAggregateAnalysis({ matchedSignals: [], }, schema: aggregateSchema, + wordpress: aggregateWordPress, recommendationCount: 0, crawlPlan: { requestedPages, @@ -932,7 +1545,17 @@ function buildAggregateAnalysis({ title: page.pageTitle, statusCode: page.statusCode, hasStructuredData: Boolean(page.schema?.hasStructuredData), + schemaTypes: page.schema?.types || [], jsonLdTypes: page.schema?.jsonLd?.types || [], + wordpress: { + detected: Boolean(page.wordpress?.detected), + plugins: (page.wordpress?.plugins || []).map((plugin) => ({ + key: plugin.key, + label: plugin.label, + category: plugin.category, + confidence: plugin.confidence, + })), + }, })), failedPages, aggregateSignals, @@ -962,9 +1585,18 @@ function buildFailureAnalysis(normalizedUrl, error, firecrawl, provider = 'inter }, schema: { hasStructuredData: false, + types: [], jsonLd: { count: 0, types: [], invalidBlocks: [] }, - microdata: { count: 0, detected: false }, - rdfa: { count: 0, detected: false }, + microdata: { count: 0, detected: false, types: [] }, + rdfa: { count: 0, detected: false, types: [] }, + }, + wordpress: { + detected: false, + detectedPageCount: 0, + plugins: [], + schemaOwnership: null, + pluginRecommendations: [], + duplicateRisk: null, }, firecrawl, crawlPlan: { @@ -1020,6 +1652,13 @@ function inferPageSignals(html, analyzedUrl, pageTitle, platform) { combined.includes('visit us') || combined.includes('call us') || combined.includes('directions'), + hasServiceHints: + combined.includes('service') || + combined.includes('services') || + combined.includes('book now') || + combined.includes('schedule consultation') || + combined.includes('get quote') || + combined.includes('request a quote'), }; } @@ -1170,7 +1809,7 @@ function buildRecommendationCode({ baseUrl, siteName, schemaType, pageScope }) { function buildRecommendations({ baseUrl, siteName, analysis, html, pageAnalyses = [] }) { const recommendationList = []; - const schemaTypes = analysis?.schema?.jsonLd?.types || []; + const schemaTypes = analysis?.schema?.types || analysis?.schema?.jsonLd?.types || []; const aggregateSignals = analysis?.aggregateSignals || {}; const pageSignals = pageAnalyses.length > 0 ? aggregateSignals @@ -1339,6 +1978,31 @@ function buildRecommendations({ baseUrl, siteName, analysis, html, pageAnalyses }); } + if ( + pageSignals.hasServiceHints + && !hasSchemaType(schemaTypes, ['Service']) + ) { + recommendationList.push({ + title: 'Add Service schema on service landing pages', + recommendation_type: 'missing_page_type', + schema_type: 'Service', + page_scope: 'service-pages', + priority: 'medium', + reason: + pageAnalyses.length > 1 + ? `Service-oriented signals appeared across ${pageSignals.servicePages || 1} analyzed page${(pageSignals.servicePages || 1) === 1 ? '' : 's'}, but Service schema was not detected.` + : 'The analyzed page appears to describe a service offering, but Service schema was not detected.', + expected_impact: + 'Helps search engines and AI systems understand the services offered, their provider, and their audience.', + suggested_schema: buildRecommendationCode({ + baseUrl, + siteName, + schemaType: 'Service', + pageScope: 'service', + }), + }); + } + return recommendationList.slice(0, PREVIEW_LIMIT); } @@ -1421,6 +2085,7 @@ function buildExportPayload({ site, analysis, recommendations }) { const exportableRecommendations = (recommendations || []).filter( (recommendation) => recommendation.suggested_schema, ); + const wordpressSummary = analysis?.wordpress || {}; const sections = exportableRecommendations.map((recommendation) => { return [ @@ -1435,12 +2100,22 @@ function buildExportPayload({ site, analysis, recommendations }) { .join('\n'); }); + const wordpressGuidanceLines = wordpressSummary.detected ? [ + wordpressSummary.schemaOwnership?.label ? `WordPress ownership: ${wordpressSummary.schemaOwnership.label}` : '', + wordpressSummary.schemaOwnership?.recommendedImplementation ? `Implementation guidance: ${wordpressSummary.schemaOwnership.recommendedImplementation}` : '', + wordpressSummary.duplicateRisk?.label ? `Duplicate risk: ${wordpressSummary.duplicateRisk.label}` : '', + ...(wordpressSummary.duplicateRisk?.warnings || []).map((warning) => `- ${warning}`), + ...(wordpressSummary.pluginRecommendations || []).map((recommendation) => `${recommendation.label}: ${recommendation.recommendedApproach}`), + ].filter(Boolean) : []; + const content = [ `Schema recommendations for ${site?.name || hostname}`, `Base URL: ${site?.base_url || ''}`, analysis?.pageTitle ? `Analyzed page: ${analysis.pageTitle}` : '', analysis?.fetchedAt ? `Analyzed at: ${analysis.fetchedAt}` : '', - '', + wordpressGuidanceLines.length > 0 ? 'WordPress implementation notes:' : '', + ...wordpressGuidanceLines, + wordpressGuidanceLines.length > 0 ? '' : '', ...sections, ] .filter(Boolean) diff --git a/frontend/src/pages/sites/analyzer.tsx b/frontend/src/pages/sites/analyzer.tsx index 1705efc..1cb9595 100644 --- a/frontend/src/pages/sites/analyzer.tsx +++ b/frontend/src/pages/sites/analyzer.tsx @@ -54,31 +54,23 @@ type AnalysisPayload = { title?: string | null; statusCode?: number | null; hasStructuredData?: boolean; + schemaTypes?: string[]; jsonLdTypes?: string[]; + wordpress?: { + detected?: boolean; + plugins?: { + key?: string; + label?: string; + category?: string; + confidence?: string; + }[]; + }; }[]; failedPages?: { url?: string; error?: string; }[]; entitlements?: Entitlements; - firecrawl?: { - provider?: string; - enabled?: boolean; - configured?: boolean; - mode?: string; - status?: string; - wouldHandleJavascript?: boolean; - wouldHandleSitemapDiscovery?: boolean; - availableForCurrentUser?: boolean; - shouldUseFirecrawl?: boolean; - usePaidOnly?: boolean; - currentProvider?: string; - crawlId?: string | null; - crawlStatus?: string | null; - creditsUsed?: number; - fallbackReason?: string; - message?: string; - }; platform?: { detected?: string; label?: string; @@ -86,6 +78,7 @@ type AnalysisPayload = { }; schema?: { hasStructuredData?: boolean; + types?: string[]; jsonLd?: { count?: number; types?: string[]; @@ -94,12 +87,52 @@ type AnalysisPayload = { microdata?: { count?: number; detected?: boolean; + types?: string[]; }; rdfa?: { count?: number; detected?: boolean; + types?: string[]; }; }; + wordpress?: { + detected?: boolean; + detectedPageCount?: number; + plugins?: { + key?: string; + label?: string; + category?: string; + confidence?: string; + pageCount?: number; + pageUrls?: string[]; + evidence?: string[]; + implementationHint?: string; + }[]; + schemaOwnership?: { + mode?: string; + label?: string; + summary?: string; + recommendedImplementation?: string; + notes?: string[]; + } | null; + pluginRecommendations?: { + key?: string; + label?: string; + category?: string; + priority?: string; + title?: string; + summary?: string; + recommendedApproach?: string; + applicableSchemaTypes?: string[]; + }[]; + duplicateRisk?: { + level?: string; + label?: string; + summary?: string; + warnings?: string[]; + affectedPlugins?: string[]; + } | null; + }; error?: string; }; @@ -834,7 +867,22 @@ const SchemaAnalyzerPage = () => { const appliedExcludeTargets = crawlPlan?.excludeTargets || draftExcludeTargets; const analyzedPages = report?.analysis?.pages || []; const failedPages = report?.analysis?.failedPages || []; - const jsonLdTypes = report?.analysis?.schema?.jsonLd?.types || []; + const detectedSchemaTypes = report?.analysis?.schema?.types || report?.analysis?.schema?.jsonLd?.types || []; + const wordpressAnalysis = report?.analysis?.wordpress || null; + const detectedWordPressPlugins = wordpressAnalysis?.plugins || []; + const wordpressSchemaOwnership = wordpressAnalysis?.schemaOwnership || null; + const wordpressPluginRecommendations = wordpressAnalysis?.pluginRecommendations || []; + const wordpressDuplicateRisk = wordpressAnalysis?.duplicateRisk || null; + const wordpressDuplicateRiskToneClassName = wordpressDuplicateRisk?.level === 'high' + ? 'border-rose-200 bg-rose-50 text-rose-800 dark:border-rose-500/40 dark:bg-rose-500/10 dark:text-rose-100' + : wordpressDuplicateRisk?.level === 'medium' + ? 'border-amber-200 bg-amber-50 text-amber-900 dark:border-amber-500/40 dark:bg-amber-500/10 dark:text-amber-100' + : 'border-emerald-200 bg-emerald-50 text-emerald-800 dark:border-emerald-500/40 dark:bg-emerald-500/10 dark:text-emerald-100'; + const wordpressDuplicateRiskBadgeClassName = wordpressDuplicateRisk?.level === 'high' + ? 'bg-rose-100 text-rose-700 dark:bg-rose-500/10 dark:text-rose-200' + : wordpressDuplicateRisk?.level === 'medium' + ? 'bg-amber-100 text-amber-700 dark:bg-amber-500/10 dark:text-amber-200' + : 'bg-emerald-100 text-emerald-700 dark:bg-emerald-500/10 dark:text-emerald-200'; const invalidJsonLdBlocks = report?.analysis?.schema?.jsonLd?.invalidBlocks || []; const hasTargetingRules = appliedIncludeTargets.length > 0 || appliedExcludeTargets.length > 0; const selectedPlatformLabel = PLATFORM_OPTIONS.find( @@ -974,9 +1022,9 @@ const SchemaAnalyzerPage = () => { ? 'No analyzed pages are missing structured data for this run.' : 'No page-level results are available yet for this analysis run.'; const step4SchemaTypes = React.useMemo(() => Array.from(new Set([ - ...jsonLdTypes, + ...detectedSchemaTypes, ...recommendations.map((recommendation) => recommendation.schema_type || '').filter(Boolean), - ])).slice(0, 8), [jsonLdTypes, recommendations]); + ])).slice(0, 8), [detectedSchemaTypes, recommendations]); const step4PrimarySchemaType = step4SchemaTypes[0] || 'Organization'; const step4PrimaryPageUrl = analyzedPages[0]?.url || report?.analysis?.analyzedUrl || report?.site?.base_url || trimmedUrl || 'https://example.com'; const platformPreviewArtifacts = React.useMemo(() => { @@ -1225,7 +1273,7 @@ const SchemaAnalyzerPage = () => { return analyzedPages.map((page, index) => { const mappedDeliverable = platformFinalDeliverables[index % platformFinalDeliverables.length] || platformFinalDeliverables[0]; const mappedArtifact = implementationArtifacts[index % implementationArtifacts.length] || platformPreviewArtifacts[0]; - const schemaType = page.jsonLdTypes?.[0] || step4SchemaTypes[index % step4SchemaTypes.length] || step4PrimarySchemaType; + const schemaType = page.schemaTypes?.[0] || page.jsonLdTypes?.[0] || step4SchemaTypes[index % step4SchemaTypes.length] || step4PrimarySchemaType; const hasExistingStructuredData = Boolean(page.hasStructuredData); return { @@ -1408,6 +1456,24 @@ add_action('wp_head', function () { failedPages: failedPagesCount, exactRequestedPageTargetMet: requestedPageTargetMet, structuredDataTypes: step4SchemaTypes, + wordpress: wordpressAnalysis?.detected ? { + detected: true, + detectedPlugins: detectedWordPressPlugins.map((plugin) => plugin.label || plugin.key || 'Plugin'), + schemaOwnership: wordpressSchemaOwnership?.label || null, + implementationGuidance: wordpressSchemaOwnership?.recommendedImplementation || null, + duplicateRisk: wordpressDuplicateRisk ? { + level: wordpressDuplicateRisk.level || null, + label: wordpressDuplicateRisk.label || null, + summary: wordpressDuplicateRisk.summary || null, + warnings: wordpressDuplicateRisk.warnings || [], + } : null, + pluginRecommendations: wordpressPluginRecommendations.map((recommendation) => ({ + plugin: recommendation.label || recommendation.key || 'Plugin', + title: recommendation.title || null, + recommendedApproach: recommendation.recommendedApproach || null, + applicableSchemaTypes: recommendation.applicableSchemaTypes || [], + })), + } : null, notice: report?.analysis?.notice || null, }, pages: analyzedPages.map((page) => ({ @@ -1415,7 +1481,8 @@ add_action('wp_head', function () { title: page.title || null, statusCode: page.statusCode || null, hasStructuredData: Boolean(page.hasStructuredData), - jsonLdTypes: page.jsonLdTypes || [], + schemaTypes: page.schemaTypes || page.jsonLdTypes || [], + wordpressPlugins: (page.wordpress?.plugins || []).map((plugin) => plugin.label || plugin.key || 'Plugin'), })), finalDeliverables: platformFinalDeliverables.map((deliverable) => ({ id: deliverable.id, @@ -1470,6 +1537,12 @@ add_action('wp_head', function () { selectedPlatformMeta.payloadLabel, step4SchemaTypes, trimmedUrl, + wordpressAnalysis?.detected, + detectedWordPressPlugins, + wordpressSchemaOwnership?.label, + wordpressSchemaOwnership?.recommendedImplementation, + wordpressDuplicateRisk, + wordpressPluginRecommendations, ]); const platformOutputPreviewJson = React.useMemo( () => JSON.stringify(platformOutputPreviewPayload, null, 2), @@ -1483,6 +1556,18 @@ add_action('wp_head', function () { `Platform package: ${selectedPlatformMeta.payloadLabel}`, `Implementation target: ${selectedPlatformMeta.developerDestination}`, step4SchemaTypes.length > 0 ? `Schema types: ${step4SchemaTypes.join(', ')}` : 'Schema types: To be determined from recommendations', + selectedPlatform === 'wordpress' && detectedWordPressPlugins.length > 0 + ? `Detected WordPress plugins: ${detectedWordPressPlugins.map((plugin) => plugin.label || plugin.key || 'Plugin').join(', ')}` + : '', + selectedPlatform === 'wordpress' && wordpressSchemaOwnership?.recommendedImplementation + ? `WordPress implementation guidance: ${wordpressSchemaOwnership.recommendedImplementation}` + : '', + selectedPlatform === 'wordpress' && wordpressDuplicateRisk?.label + ? `Duplicate-schema risk: ${wordpressDuplicateRisk.label}${wordpressDuplicateRisk.summary ? ` — ${wordpressDuplicateRisk.summary}` : ''}` + : '', + selectedPlatform === 'wordpress' && wordpressPluginRecommendations.length > 0 + ? `Plugin implementation recommendations: ${wordpressPluginRecommendations.slice(0, 2).map((recommendation) => `${recommendation.label || recommendation.key || 'Plugin'} — ${recommendation.recommendedApproach || recommendation.summary || 'Review detected plugin output before coding.'}`).join(' | ')}` + : '', '', 'Demo workflow preview:', ...selectedPlatformMeta.steps.map((step, index) => `${index + 1}. ${step}`), @@ -1494,8 +1579,14 @@ add_action('wp_head', function () { selectedPlatformMeta.developerDestination, selectedPlatformMeta.payloadLabel, selectedPlatformMeta.steps, + selectedPlatform, step4SchemaTypes, trimmedUrl, + detectedWordPressPlugins, + wordpressSchemaOwnership?.recommendedImplementation, + wordpressDuplicateRisk?.label, + wordpressDuplicateRisk?.summary, + wordpressPluginRecommendations, ]); const deliverySummaryCards = [ { @@ -2391,11 +2482,11 @@ https://example.com/pricing`} )} - {jsonLdTypes.length > 0 && ( + {detectedSchemaTypes.length > 0 && (
-
Detected JSON-LD types
+
Detected schema types
- {jsonLdTypes.map((typeName) => ( + {detectedSchemaTypes.map((typeName) => ( )} + {wordpressAnalysis?.detected && ( +
+
+
+
+
WordPress plugin detection
+
+ {wordpressAnalysis.detectedPageCount || 0} analyzed page{(wordpressAnalysis.detectedPageCount || 0) === 1 ? '' : 's'} showed WordPress signals. +
+
+
+ {detectedWordPressPlugins.length > 0 ? detectedWordPressPlugins.map((plugin) => ( + + {(plugin.label || plugin.key || 'Plugin')} + {plugin.confidence ? ` (${plugin.confidence})` : ''} + + )) : ( + + WordPress detected, but no common plugins were identified yet + + )} +
+
+ +
+ {wordpressSchemaOwnership && ( +
+
Schema ownership guidance
+
{wordpressSchemaOwnership.label}
+

{wordpressSchemaOwnership.summary}

+ {wordpressSchemaOwnership.recommendedImplementation && ( +

+ Recommended path: {wordpressSchemaOwnership.recommendedImplementation} +

+ )} +
+ )} + + {wordpressDuplicateRisk && ( +
+
+
Duplicate-schema risk
+ + {wordpressDuplicateRisk.label || 'Risk review recommended'} + +
+ {wordpressDuplicateRisk.summary && ( +

{wordpressDuplicateRisk.summary}

+ )} + {(wordpressDuplicateRisk.warnings || []).length > 0 && ( +
    + {(wordpressDuplicateRisk.warnings || []).map((warning) => ( +
  • {warning}
  • + ))} +
+ )} +
+ )} + + {wordpressPluginRecommendations.length > 0 && ( +
+
Plugin-specific implementation recommendations
+
+ {wordpressPluginRecommendations.map((recommendation) => ( +
+
+
{recommendation.title || recommendation.label || recommendation.key || 'Plugin guidance'}
+ {recommendation.priority && ( + + {recommendation.priority} + + )} +
+ {recommendation.summary && ( +

{recommendation.summary}

+ )} + {recommendation.recommendedApproach && ( +

+ Recommended approach: {recommendation.recommendedApproach} +

+ )} + {(recommendation.applicableSchemaTypes || []).length > 0 && ( +
+ {(recommendation.applicableSchemaTypes || []).map((schemaType) => ( + + {schemaType} + + ))} +
+ )} +
+ ))} +
+
+ )} +
+
+
+ )} + {invalidJsonLdBlocks.length > 0 && (
Invalid JSON-LD detected
@@ -2528,9 +2722,9 @@ https://example.com/pricing`} className='border-0 bg-slate-50 px-3 py-3 text-sm text-slate-600 dark:bg-slate-900/60 dark:text-slate-300' descriptionClassName='leading-normal text-inherit dark:text-inherit' > - {(page.jsonLdTypes || []).length > 0 ? ( + {(page.schemaTypes || page.jsonLdTypes || []).length > 0 ? (
- {(page.jsonLdTypes || []).slice(0, 4).map((typeName) => ( + {(page.schemaTypes || page.jsonLdTypes || []).slice(0, 4).map((typeName) => ( ) : ( -
No JSON-LD types were detected on this page.
+
No schema types were identified on this page yet.
+ )} + + + {(page.wordpress?.plugins || []).length > 0 ? ( +
+ {(page.wordpress?.plugins || []).slice(0, 4).map((plugin) => ( + + {plugin.label || plugin.key || 'Plugin'} + {plugin.confidence ? ` (${plugin.confidence})` : ''} + + ))} +
+ ) : ( +
No common WordPress plugins were identified on this page.
)}
@@ -3256,6 +3471,46 @@ https://example.com/pricing`} description={selectedPlatformMeta.liveStatus} /> + {selectedPlatform === 'wordpress' && (detectedWordPressPlugins.length > 0 || wordpressSchemaOwnership || wordpressDuplicateRisk || wordpressPluginRecommendations.length > 0) && ( + +
+
+ {detectedWordPressPlugins.length > 0 ? detectedWordPressPlugins.map((plugin) => ( + + {plugin.label || plugin.key || 'Plugin'} + + )) : ( + + No common plugins detected + + )} + {wordpressDuplicateRisk?.label && ( + + {wordpressDuplicateRisk.label} + + )} +
+ {wordpressPluginRecommendations.length > 0 && ( +
+ {wordpressPluginRecommendations.slice(0, 2).map((recommendation) => ( +
+ {recommendation.label || recommendation.key || 'Plugin'}:{' '} + {recommendation.recommendedApproach || recommendation.summary || 'Review plugin output before coding schema changes.'} +
+ ))} +
+ )} +
+
+ )} + 0