38191-vm/core/tasks.py
2026-02-07 02:35:09 +00:00

169 lines
6.8 KiB
Python

import httpx
from celery import shared_task
from django.utils import timezone
from core.models import Bookmark, Extraction, Summary
from ai.local_ai_api import LocalAIApi
from bs4 import BeautifulSoup
import html2text
import logging
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def get_base_url(url):
    """Return the scheme+host root of *url*, e.g. 'https://x.com/a/b' -> 'https://x.com/'.

    The netloc keeps any explicit port; path, query, and fragment are dropped.
    """
    parts = urlparse(url)
    return "{0}://{1}/".format(parts.scheme, parts.netloc)
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch a bookmark's URL, extract its title and text, store an Extraction, and queue AI summarization.

    Falls back to fetching the site's base domain when the exact page fails or
    returns suspiciously little content (< 500 chars, likely an anti-bot or
    redirect stub). If both fetches fail, a placeholder HTML document is stored
    so the extraction/summary pipeline always has something to work with.

    Args:
        bookmark_id: Primary key of the Bookmark to process. Silently returns
            if the row no longer exists (deleted before the task ran).

    Returns:
        A short status string, or None when the bookmark is gone.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        # Bookmark was deleted before the worker picked this up; nothing to do.
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False

    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')
            # If content is too small, maybe it's a redirect or anti-bot page
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")
    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            else:
                # Bookmark already points at the base domain: nothing else to
                # try. Re-raise unless the first attempt produced *some*
                # (undersized) content we can still keep.
                if not html_content:
                    raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")
            if not html_content:
                # Store a placeholder so downstream extraction/summary steps
                # have well-formed HTML to operate on.
                html_content = f"<html><body><p>Failed to retrieve content from {bookmark.url} and its base domain.</p></body></html>"
                status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')
    # Fill in the title from the page's <title> tag if the user didn't set one.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            # BUGFIX: title_tag.string is None when <title> contains nested
            # markup, which made .strip() raise AttributeError. get_text()
            # handles both the plain-text and nested cases safely.
            title_text = title_tag.get_text(strip=True)
            if title_text:
                bookmark.title = title_text[:255]
                bookmark.save()

    # Convert HTML to markdown-ish plain text for storage and summarization.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)

    # Upsert keyed on the bookmark so re-processing refreshes the extraction.
    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            }
        }
    )
    # AI summary runs as its own task so a slow model doesn't block scraping.
    generate_summary.delay(bookmark_id)
    return f"Processed bookmark {bookmark_id}"
@shared_task
def generate_summary(bookmark_id):
    """Generate an AI summary for a bookmark's extracted text, with fallbacks.

    Always writes a Summary row — the AI output, an "insufficient content"
    notice, or an explanatory fallback — so the UI never waits on a summary
    that will not arrive.

    Args:
        bookmark_id: Primary key of the Bookmark to summarize. Silently
            returns if the bookmark or its extraction does not exist.

    Returns:
        A short status string, or None when there is nothing to summarize.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
        extraction = bookmark.extraction
    except Bookmark.DoesNotExist:
        return
    except Extraction.DoesNotExist:
        # If extraction doesn't exist yet, we might want to wait or just return
        # But in EAGER mode it should be there.
        return

    # ROBUSTNESS: content_text / metadata may be NULL in the database; guard
    # before calling .strip() / .get() to avoid AttributeError.
    content_to_summarize = (extraction.content_text or "").strip()
    metadata = extraction.metadata or {}
    used_backup = metadata.get('used_backup', False)

    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Prepare prompt for AI. When only the base domain was reachable, ask the
    # model to describe the site rather than the (unreachable) specific page.
    if used_backup:
        prompt = f"The specific page '{bookmark.url}' could not be reached. Summarize the main domain front page content instead to describe what this website is about.\n\nContent:\n{content_to_summarize[:4000]}"
    else:
        prompt = f"Summarize the following content from the webpage '{bookmark.title or bookmark.url}' in 2-3 concise sentences. Focus on the main points for a researcher.\n\nContent:\n{content_to_summarize[:4000]}"

    try:
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": "You are a helpful assistant that summarizes web content for researchers and knowledge workers. Be concise and professional."},
                {"role": "user", "content": prompt},
            ],
        })
        summary_text = None
        if response.get("success"):
            summary_text = LocalAIApi.extract_text(response)
        # Require a minimally substantial answer before accepting it.
        if summary_text and len(summary_text.strip()) > 10:
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': summary_text.strip()}
            )
            return f"Generated summary for bookmark {bookmark_id}"
        else:
            error_msg = response.get('error') or "Empty response from AI"
            logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")
            # Create a fallback summary to stop the spinner
            fallback_content = "AI summary could not be generated at this time. "
            if used_backup:
                fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
            elif bookmark.title:
                fallback_content += f"The page appears to be titled '{bookmark.title}'."
            else:
                fallback_content += f"Please visit the link directly: {bookmark.url}"
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': fallback_content}
            )
            return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        # Last-resort catch: a crash here would leave the bookmark with no
        # summary row at all, so log and store a generic error message.
        logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"