91 lines
3.0 KiB
Python
91 lines
3.0 KiB
Python
import httpx
|
|
from celery import shared_task
|
|
from django.utils import timezone
|
|
from core.models import Bookmark, Extraction, Summary
|
|
from ai.local_ai_api import LocalAIApi
|
|
from bs4 import BeautifulSoup
|
|
import html2text
|
|
import logging
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch a bookmark's page, store its extracted content, and queue a summary.

    Steps: download ``bookmark.url`` with httpx (following redirects, 30 s
    timeout), fill in a missing title from the page's ``<title>`` tag, convert
    the HTML to text with html2text, upsert the ``Extraction`` row for the
    bookmark, then enqueue ``generate_summary``.

    Args:
        bookmark_id: Primary key of the ``Bookmark`` to process.

    Returns:
        A status string on success, or ``None`` when no bookmark with that id
        exists.

    Raises:
        Whatever ``self.retry`` raises when the download fails: the task is
        rescheduled 60 seconds later, and per Celery's documented behaviour the
        original exception is re-raised once ``max_retries`` is exhausted.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        # Nothing to do (and nothing worth retrying) if the row is gone.
        logger.warning(
            "Bookmark %s does not exist; skipping processing", bookmark_id
        )
        return

    try:
        with httpx.Client(follow_redirects=True, timeout=30.0) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
    except Exception as exc:
        # Task boundary: any fetch failure is logged and handed to Celery's
        # retry machinery rather than crashing the worker call outright.
        logger.error("Error fetching bookmark %s: %s", bookmark_id, exc)
        raise self.retry(exc=exc, countdown=60)

    # Simple title extraction if not already set. The HTML is only parsed when
    # a title is actually needed.
    if not bookmark.title:
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.find('title')
        if title_tag:
            # ``title_tag.string`` is None for an empty <title> or one that
            # contains nested markup; ``get_text`` handles both cases.
            title_text = title_tag.get_text(strip=True)[:255]
            if title_text:
                bookmark.title = title_text
                bookmark.save()

    # Readability extraction: keep links, drop images.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = True
    text_content = converter.handle(html_content)

    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': response.status_code,
                'content_type': response.headers.get('content-type'),
            },
        },
    )

    # AI summary generation runs as its own task so a slow model call does not
    # hold up this one.
    generate_summary.delay(bookmark_id)

    return f"Processed bookmark {bookmark_id}"
|
|
|
|
@shared_task
def generate_summary(bookmark_id):
    """Ask the local AI service for a short summary of a bookmark's text.

    Loads the bookmark and its extraction, sends the first 4000 characters of
    the extracted text to ``LocalAIApi.create_response``, and upserts the
    ``Summary`` row when the call reports success and yields text.

    Args:
        bookmark_id: Primary key of the ``Bookmark`` to summarize.

    Returns:
        A status string describing success or failure, or ``None`` when the
        bookmark or its extraction is missing or the extracted text is empty.
    """
    system_message = (
        "You are a helpful assistant that summarizes web content for "
        "researchers and knowledge workers. Be concise and professional."
    )

    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
        extraction = bookmark.extraction
    except (Bookmark.DoesNotExist, Extraction.DoesNotExist):
        return

    if not extraction.content_text:
        return

    # Prepare prompt for AI: label the page by title (falling back to the URL)
    # and cap the content at 4000 characters.
    page_label = bookmark.title or bookmark.url
    excerpt = extraction.content_text[:4000]
    prompt = (
        f"Summarize the following content from the webpage '{page_label}' "
        "in 2-3 concise sentences. Focus on the main points for a researcher."
        f"\n\nContent:\n{excerpt}"
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    ai_result = LocalAIApi.create_response({"input": messages})

    # Text is only extracted from a response that reports success.
    summary_text = None
    if ai_result.get("success"):
        summary_text = LocalAIApi.extract_text(ai_result)

    if summary_text:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': summary_text},
        )
        return f"Generated summary for bookmark {bookmark_id}"

    logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {ai_result.get('error')}")
    return f"Failed to generate summary for bookmark {bookmark_id}"