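"""Celery tasks for the bookmark pipeline.

``process_bookmark`` fetches a bookmarked page (falling back to the site's base
domain when the exact URL fails), stores the extracted HTML and text, and then
chains ``generate_summary``, which asks the local AI API for a short summary
and, when the bookmark is untagged, a handful of suggested tags.
"""
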
import logging
from urllib.parse import urlparse

import html2text
import httpx
from bs4 import BeautifulSoup
from celery import shared_task
from taggit.models import Tag

from ai.local_ai_api import LocalAIApi
from core.models import Bookmark, Extraction, Summary

logger = logging.getLogger(__name__)


# Browser-like headers: some sites reject requests that advertise an obvious bot User-Agent.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def get_base_url(url):
    """Return the scheme and host of ``url``, e.g. ``https://example.com/``."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch the bookmarked URL, store the extraction, and queue the AI summary."""
    # bind=True exposes `self` so `self.retry()` could be used; the current flow
    # prefers the base-domain fallback below over retrying.
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False
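
    # Fetch strategy: try the exact URL first; on any failure (network error, bad
    # status, or a suspiciously tiny page) fall back to the site's base domain so
    # at least some context is stored. As a last resort a placeholder HTML page is
    # recorded, so downstream steps never see an empty extraction.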
    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')

            # If the content is very small, it is probably a redirect stub or an anti-bot page.
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")

    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            elif not html_content:
                # The URL already is the base domain and nothing usable was fetched.
                raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")
            if not html_content:
                html_content = f"<html><body><p>Failed to retrieve content from {bookmark.url} and its base domain.</p></body></html>"
                status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')

    # Fill in the title from the page's <title> tag if the user did not set one.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if title_text:
                bookmark.title = title_text[:255]
                bookmark.save()

    # Convert the HTML to markdown-ish plain text for storage and summarization.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)
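
    # Persist both the raw HTML and the converted text; generate_summary reads
    # content_text and the request metadata stored here (e.g. used_backup).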
    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            },
        },
    )

    # AI summary generation
    generate_summary.delay(bookmark_id)

    return f"Processed bookmark {bookmark_id}"
@shared_task
def generate_summary(bookmark_id):
    """Ask the local AI API for a summary (and optionally tags) of the extraction."""
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    try:
        extraction = bookmark.extraction
    except Extraction.DoesNotExist:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "Content extraction failed or is still in progress. AI summary cannot be generated."}
        )
        return

    # Be defensive about nullable fields on the extraction.
    content_to_summarize = (extraction.content_text or "").strip()
    used_backup = (extraction.metadata or {}).get('used_backup', False)

    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Only generate tags when the bookmark has none; offer existing tags for reuse.
    should_generate_tags = bookmark.tags.count() == 0
    existing_tags = list(Tag.objects.values_list('name', flat=True).distinct()[:50])
    existing_tags_str = ", ".join(existing_tags)

    # Build the prompt for the AI.
    system_prompt = (
        "You are a helpful assistant that summarizes web content and suggests tags for researchers. "
        "Be concise and professional. Always return the response in JSON format."
    )

    user_prompt = f"Analyze the following content from the webpage '{bookmark.title or bookmark.url}'.\n\n"
    user_prompt += "1. Provide a summary in 2-3 concise sentences.\n"

    if should_generate_tags:
        user_prompt += "2. Suggest 3-5 short and concise tags for this content.\n"
        if existing_tags:
            user_prompt += f"Prioritize these existing tags if they match: {existing_tags_str}\n"

    user_prompt += "\nReturn your response in valid JSON format:\n"
    user_prompt += "{\n  \"summary\": \"your summary here\""
    if should_generate_tags:
        user_prompt += ",\n  \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n"
    else:
        user_prompt += "\n"
    user_prompt += "}\n\n"
    user_prompt += f"Content:\n{content_to_summarize[:4000]}"
    try:
        logger.info(f"Generating summary/tags for bookmark {bookmark_id}...")
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # "response_format": {"type": "json_object"}  # Some proxies might not like this
        })
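
        # As used here, LocalAIApi.create_response is assumed to return a dict with a
        # "success" flag (and "error" on failure), with extract_text() and
        # decode_json_from_response() as helpers for pulling text/JSON out of it.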
        summary_text = None
        suggested_tags = []

        if response.get("success"):
            raw_text = LocalAIApi.extract_text(response)
            logger.info(f"AI raw response for {bookmark_id}: {raw_text}")
            data = LocalAIApi.decode_json_from_response(response)
            if data:
                summary_text = data.get("summary")
                suggested_tags = data.get("tags", [])
                logger.info(f"Decoded JSON for {bookmark_id}: summary={bool(summary_text)}, tags={suggested_tags}")
            else:
                logger.warning(f"JSON decoding failed for {bookmark_id}. Falling back to raw text.")
                summary_text = raw_text

            if summary_text and len(summary_text.strip()) > 10:
                Summary.objects.update_or_create(
                    bookmark=bookmark,
                    defaults={'content': summary_text.strip()}
                )

                # Attach suggested tags, but only if the bookmark started without any.
                if should_generate_tags and suggested_tags:
                    # Limit to 5 tags and make sure they are plain strings.
                    valid_tags = [str(t)[:50] for t in suggested_tags if t][:5]
                    if valid_tags:
                        bookmark.tags.add(*valid_tags)
                        logger.info(f"Successfully added tags {valid_tags} to bookmark {bookmark_id}")
                        return f"Generated summary and tags for bookmark {bookmark_id}"

                return f"Generated summary for bookmark {bookmark_id}"
        else:
            error_msg = response.get('error') or "Empty response from AI"
            logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")

            # Create a fallback summary to stop the spinner in the UI.
            fallback_content = "AI summary could not be generated at this time. "
            if used_backup:
                fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
            elif bookmark.title:
                fallback_content += f"The page appears to be titled '{bookmark.title}'."
            else:
                fallback_content += f"Please visit the link directly: {bookmark.url}"

            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': fallback_content}
            )
            return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"