193 lines
6.5 KiB
Python
193 lines
6.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Recipe Validation Script for Flatlogic
|
|
Run this on your recipe dataset before delivery
|
|
"""
|
|
|
|
import json
|
|
import gzip
|
|
import re
|
|
from pathlib import Path
|
|
|
|
# Keywords that should appear in ingredients if in title
|
|
FOOD_KEYWORDS = {
|
|
'chicken', 'beef', 'pork', 'turkey', 'lamb', 'fish', 'salmon', 'tuna', 'shrimp',
|
|
'pasta', 'spaghetti', 'rice', 'quinoa', 'potato', 'potatoes', 'sweet potato',
|
|
'tomato', 'tomatoes', 'onion', 'onions', 'garlic', 'carrot', 'carrots',
|
|
'broccoli', 'spinach', 'mushroom', 'mushrooms', 'pepper', 'peppers',
|
|
'zucchini', 'squash', 'butternut squash', 'eggplant', 'cheese', 'cheddar',
|
|
'mozzarella', 'parmesan', 'milk', 'butter', 'cream', 'egg', 'eggs',
|
|
'bread', 'tortilla', 'pita', 'peanut butter', 'jam', 'jelly', 'chocolate',
|
|
'apple', 'apples', 'banana', 'bananas', 'orange', 'lemon', 'lime',
|
|
'strawberry', 'blueberry', 'bacon', 'sausage', 'ham', 'bean', 'beans',
|
|
'chickpea', 'chickpeas', 'lentil', 'lentils', 'corn', 'peas', 'avocado',
|
|
'tofu', 'nuts', 'almond', 'almonds', 'walnut', 'walnuts', 'coconut',
|
|
'pineapple', 'mango', 'oil', 'olive oil', 'vinegar', 'soy sauce',
|
|
'flour', 'sugar', 'honey', 'maple syrup', 'salt', 'pepper'
|
|
}
|
|
|
|
# Recipe types to exclude from matching (the end product, not an ingredient)
|
|
EXCLUDE_RECIPE_TYPES = {
|
|
'bread', 'cake', 'pie', 'cookies', 'muffins', 'brownies', 'bars',
|
|
'soup', 'stew', 'chili', 'sauce', 'gravy', 'dip', 'spread',
|
|
'salad', 'slaw', 'casserole', 'lasagna', 'pizza', 'smoothie',
|
|
'shake', 'cocktail', 'drink', 'burger', 'sandwich', 'wrap', 'taco', 'burrito'
|
|
}
|
|
|
|
def extract_food_keywords_from_title(title):
|
|
"""Extract food keywords that should be in ingredients"""
|
|
title_lower = title.lower()
|
|
|
|
# Clean title
|
|
title_lower = re.sub(r"'s\s+", ' ', title_lower)
|
|
title_lower = re.sub(r'\b(best|easy|quick|simple|homemade|perfect|delicious|amazing|ultimate|fried|baked|grilled|roasted|sauteed|steamed|boiled|poached)\b', '', title_lower)
|
|
title_lower = re.sub(r'\b(with|and|or|in|on)\b', ' ', title_lower)
|
|
|
|
found_keywords = []
|
|
for keyword in FOOD_KEYWORDS:
|
|
if keyword in EXCLUDE_RECIPE_TYPES:
|
|
continue
|
|
# Check for whole word matches
|
|
pattern = r'\b' + re.escape(keyword) + r'\b'
|
|
if re.search(pattern, title_lower):
|
|
found_keywords.append(keyword)
|
|
|
|
return found_keywords
|
|
|
|
def check_ingredients_match_title(recipe):
|
|
"""Check if title keywords are in ingredients"""
|
|
title_keywords = extract_food_keywords_from_title(recipe['name'])
|
|
if not title_keywords:
|
|
return True, [] # No specific food keywords in title
|
|
|
|
ingredients_str = ' '.join(recipe.get('ingredients', [])).lower()
|
|
|
|
missing = []
|
|
for keyword in title_keywords:
|
|
if keyword not in ingredients_str:
|
|
missing.append(keyword)
|
|
|
|
return len(missing) == 0, missing
|
|
|
|
def validate_recipe(recipe, idx):
|
|
"""Validate a single recipe"""
|
|
errors = []
|
|
|
|
# Required fields
|
|
if not recipe.get('name'):
|
|
errors.append("Missing name")
|
|
|
|
if not recipe.get('ingredients') or len(recipe['ingredients']) == 0:
|
|
errors.append("No ingredients")
|
|
elif len(recipe['ingredients']) < 2:
|
|
errors.append("Too few ingredients (minimum 2)")
|
|
|
|
if not recipe.get('steps') or len(recipe['steps']) == 0:
|
|
errors.append("No instructions")
|
|
elif len(recipe['steps']) < 2:
|
|
errors.append("Too few steps (minimum 2)")
|
|
|
|
# Check time
|
|
minutes = recipe.get('minutes')
|
|
if minutes is None:
|
|
errors.append("Missing time")
|
|
elif minutes < 1 or minutes > 1440: # More than 24 hours
|
|
errors.append(f"Invalid time: {minutes} minutes")
|
|
|
|
# Check title-ingredient match
|
|
matches, missing = check_ingredients_match_title(recipe)
|
|
if not matches:
|
|
errors.append(f"Title ingredients missing: {', '.join(missing)}")
|
|
|
|
return errors
|
|
|
|
def load_recipes(filepath):
|
|
"""Load recipes from JSON or gzipped JSON"""
|
|
path = Path(filepath)
|
|
|
|
if path.suffix == '.gz':
|
|
with gzip.open(path, 'rt') as f:
|
|
return json.load(f)
|
|
else:
|
|
with open(path) as f:
|
|
return json.load(f)
|
|
|
|
def main():
|
|
import sys
|
|
|
|
if len(sys.argv) < 2:
|
|
print("Usage: python3 validate_recipes.py <recipes.json|recipes.json.gz>")
|
|
sys.exit(1)
|
|
|
|
filepath = sys.argv[1]
|
|
print(f"Loading recipes from {filepath}...")
|
|
|
|
try:
|
|
recipes = load_recipes(filepath)
|
|
except Exception as e:
|
|
print(f"Error loading file: {e}")
|
|
sys.exit(1)
|
|
|
|
print(f"\nTotal recipes: {len(recipes)}")
|
|
print("\nValidating...")
|
|
|
|
valid_count = 0
|
|
invalid_count = 0
|
|
errors_by_type = {}
|
|
sample_errors = []
|
|
|
|
for i, recipe in enumerate(recipes):
|
|
if i % 1000 == 0:
|
|
print(f" Processed {i}/{len(recipes)}...")
|
|
|
|
errors = validate_recipe(recipe, i)
|
|
|
|
if errors:
|
|
invalid_count += 1
|
|
for error in errors:
|
|
errors_by_type[error] = errors_by_type.get(error, 0) + 1
|
|
|
|
if len(sample_errors) < 5:
|
|
sample_errors.append({
|
|
'name': recipe.get('name', 'Unknown'),
|
|
'errors': errors
|
|
})
|
|
else:
|
|
valid_count += 1
|
|
|
|
# Report
|
|
print(f"\n{'='*60}")
|
|
print("VALIDATION RESULTS")
|
|
print(f"{'='*60}")
|
|
print(f"Total recipes: {len(recipes)}")
|
|
print(f"Valid recipes: {valid_count} ({valid_count/len(recipes)*100:.1f}%)")
|
|
print(f"Invalid recipes: {invalid_count} ({invalid_count/len(recipes)*100:.1f}%)")
|
|
|
|
print(f"\nError breakdown:")
|
|
for error, count in sorted(errors_by_type.items(), key=lambda x: x[1], reverse=True):
|
|
print(f" - {error}: {count} recipes")
|
|
|
|
if sample_errors:
|
|
print(f"\nSample errors:")
|
|
for item in sample_errors:
|
|
print(f" - {item['name']}: {', '.join(item['errors'])}")
|
|
|
|
# Quality score
|
|
quality_score = valid_count / len(recipes) * 100
|
|
print(f"\nQuality Score: {quality_score:.1f}%")
|
|
|
|
if quality_score >= 95:
|
|
print("✅ EXCELLENT - Ready for delivery")
|
|
elif quality_score >= 85:
|
|
print("⚠️ GOOD - Minor issues, acceptable for delivery")
|
|
elif quality_score >= 70:
|
|
print("❌ NEEDS WORK - Fix major issues before delivery")
|
|
else:
|
|
print("❌ REJECT - Significant data quality issues")
|
|
|
|
return quality_score
|
|
|
|
if __name__ == '__main__':
|
|
score = main()
|
|
sys.exit(0 if score >= 85 else 1)
|