38191-vm/core/tasks.py
2026-02-09 04:37:40 +00:00

217 lines
8.7 KiB
Python

import httpx
from celery import shared_task
from django.utils import timezone
from core.models import Bookmark, Extraction, Summary
from ai.local_ai_api import LocalAIApi
from bs4 import BeautifulSoup
import html2text
import logging
from urllib.parse import urlparse
from taggit.models import Tag
import json
logger = logging.getLogger(__name__)
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def get_base_url(url):
    """Return the scheme+host root of *url* with a trailing slash.

    E.g. ``https://example.com/a/b?q=1`` -> ``https://example.com/``.
    """
    parts = urlparse(url)
    return "{}://{}/".format(parts.scheme, parts.netloc)
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch a bookmarked page, persist its HTML/text extraction, and queue summarization.

    Fetch strategy: try the exact URL first; if that fails (network error,
    HTTP error status, or a suspiciously small body that suggests an
    anti-bot/redirect page), retry against the site's base domain. If both
    attempts fail, persist placeholder HTML so the downstream extraction and
    summary steps still complete instead of leaving the bookmark stuck.

    Returns a short status string (Celery result), or None if the bookmark
    no longer exists.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        # Bookmark was deleted before the worker picked up the task.
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False

    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')
            # If content is too small, maybe it's a redirect or anti-bot page
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")
    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            else:
                # The URL already is the base domain, so there is no distinct
                # backup to try; re-raise so the inner handler logs it and the
                # placeholder path below takes over.
                if not html_content:
                    raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")

    if not html_content:
        # Persist a placeholder so the extraction/summary pipeline completes
        # and the UI is not left waiting on a record that will never appear.
        html_content = f"<html><body><p>Failed to retrieve content from {bookmark.url} and its base domain.</p></body></html>"
        status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')
    # Simple title extraction if not already set.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            # BUGFIX: title_tag.string is None when <title> contains nested
            # markup, so .string.strip() could raise AttributeError mid-task.
            # get_text() is safe for both plain and nested titles.
            title_text = title_tag.get_text().strip()
            if title_text:
                bookmark.title = title_text[:255]
                bookmark.save()

    # Readability extraction: convert the HTML to markdown-ish plain text.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)

    extraction, created = Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            }
        }
    )

    # AI summary generation runs asynchronously in its own task.
    generate_summary.delay(bookmark_id)
    return f"Processed bookmark {bookmark_id}"
@shared_task
def generate_summary(bookmark_id):
    """Generate an AI summary (and optionally tags) for a bookmark's extraction.

    A Summary row is written in every code path — success, AI failure,
    missing extraction, or insufficient content — so the UI never waits
    indefinitely on a summary that will not arrive. Tags are only suggested
    when the bookmark currently has none, and existing site-wide tags are
    offered to the model to encourage reuse over proliferation.

    Returns a short status string (Celery result), or None if the bookmark
    no longer exists.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    try:
        extraction = bookmark.extraction
    except Extraction.DoesNotExist:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "Content extraction failed or is still in progress. AI summary cannot be generated."}
        )
        return

    # BUGFIX: content_text may be None; calling .strip() directly would raise
    # AttributeError outside the try/except below, leaving no fallback Summary.
    content_to_summarize = (extraction.content_text or "").strip()
    used_backup = extraction.metadata.get('used_backup', False)
    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Only ask the model for tags when the bookmark has none of its own.
    should_generate_tags = bookmark.tags.count() == 0
    existing_tags = list(Tag.objects.values_list('name', flat=True).distinct()[:50])
    existing_tags_str = ", ".join(existing_tags)

    # Prepare prompt for AI. The JSON template adapts to whether tags are wanted.
    system_prompt = "You are a helpful assistant that summarizes web content and suggests tags for researchers. Be concise and professional. Always return response in JSON format."
    user_prompt = f"Analyze the following content from the webpage '{bookmark.title or bookmark.url}'.\n\n"
    user_prompt += "1. Provide a summary in 2-3 concise sentences.\n"
    if should_generate_tags:
        user_prompt += "2. Suggest 3-5 short and concise tags for this content.\n"
        if existing_tags:
            user_prompt += f"Prioritize these existing tags if they match: {existing_tags_str}\n"
    user_prompt += "\nReturn your response in valid JSON format:\n"
    user_prompt += "{\n \"summary\": \"your summary here\""
    if should_generate_tags:
        user_prompt += ",\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n"
    else:
        user_prompt += "\n"
    user_prompt += "}\n\n"
    # Truncate content to keep the prompt within the local model's context.
    user_prompt += f"Content:\n{content_to_summarize[:4000]}"

    try:
        logger.info(f"Generating summary/tags for bookmark {bookmark_id}...")
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # "response_format": {"type": "json_object"} # Some proxies might not like this
        })

        summary_text = None
        suggested_tags = []
        if response.get("success"):
            raw_text = LocalAIApi.extract_text(response)
            logger.info(f"AI Raw Response for {bookmark_id}: {raw_text}")
            data = LocalAIApi.decode_json_from_response(response)
            if data:
                summary_text = data.get("summary")
                suggested_tags = data.get("tags", [])
                # BUGFIX: the model may return a string instead of a list;
                # iterating a string would create one bogus tag per character.
                if not isinstance(suggested_tags, list):
                    suggested_tags = []
                logger.info(f"Decoded JSON for {bookmark_id}: summary={bool(summary_text)}, tags={suggested_tags}")
            else:
                logger.warning(f"JSON decoding failed for {bookmark_id}. Fallback to text.")
                summary_text = raw_text

        if summary_text and len(summary_text.strip()) > 10:
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': summary_text.strip()}
            )
            # Add tags if we should.
            if should_generate_tags and suggested_tags:
                # Normalize, dedupe (case-insensitive), and cap at 5 tags
                # of at most 50 characters each.
                seen = set()
                valid_tags = []
                for tag in suggested_tags:
                    if not tag:
                        continue
                    name = str(tag).strip()[:50]
                    if name and name.lower() not in seen:
                        seen.add(name.lower())
                        valid_tags.append(name)
                    if len(valid_tags) == 5:
                        break
                if valid_tags:
                    bookmark.tags.add(*valid_tags)
                    logger.info(f"Successfully added tags {valid_tags} to bookmark {bookmark_id}")
                    return f"Generated summary and tags for bookmark {bookmark_id}"
            return f"Generated summary for bookmark {bookmark_id}"
        else:
            error_msg = response.get('error') or "Empty response from AI"
            logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")
            # Create a fallback summary to stop the spinner.
            fallback_content = "AI summary could not be generated at this time. "
            if used_backup:
                fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
            elif bookmark.title:
                fallback_content += f"The page appears to be titled '{bookmark.title}'."
            else:
                fallback_content += f"Please visit the link directly: {bookmark.url}"
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': fallback_content}
            )
            return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        # logger.exception records the traceback for debugging AI/proxy issues.
        logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"