"""Celery tasks for fetching bookmarked pages, extracting readable content,
and generating AI summaries.

Pipeline: ``process_bookmark`` fetches the URL (falling back to the site's
root domain when the exact page fails), stores an ``Extraction``, then queues
``generate_summary`` to produce/refresh the bookmark's ``Summary``.
"""
import logging
from urllib.parse import urlparse

import html2text
import httpx
from bs4 import BeautifulSoup
from celery import shared_task
from django.utils import timezone

from ai.local_ai_api import LocalAIApi
from core.models import Bookmark, Extraction, Summary

logger = logging.getLogger(__name__)

# Browser-like User-Agent: many sites reject requests with library defaults.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
}


def get_base_url(url):
    """Return the ``scheme://netloc/`` root of *url* (with trailing slash)."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"


@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch a bookmark's page, store its extracted content, queue a summary.

    Fetch strategy:
      1. GET the bookmark URL. Content under 500 chars is treated as a
         failed scrape (redirect stub / anti-bot page) and raises.
      2. On any failure, retry against the site's root domain and mark the
         extraction metadata with ``used_backup=True``.
      3. If both fail, store a short placeholder so downstream steps can
         still run and report the failure to the user.

    Args:
        bookmark_id: Primary key of the ``Bookmark`` to process.

    Returns:
        A status string, or ``None`` if the bookmark no longer exists.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        # Bookmark was deleted between enqueue and execution; nothing to do.
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False

    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')
            # If content is too small, maybe it's a redirect or anti-bot page
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")
    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            else:
                # Bookmark already points at the domain root: keep whatever
                # (possibly small) content we got; otherwise surface the error.
                if not html_content:
                    raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")

    if not html_content:
        # Both fetches failed — store a placeholder so the extraction and
        # summary steps still produce user-visible output.
        html_content = f"Failed to retrieve content from {bookmark.url} and its base domain."
        status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')

    # Simple title extraction if not already set.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            # BUG FIX: title_tag.string is None when <title> has nested
            # markup or is empty, which crashed .strip(); get_text() is safe.
            title = title_tag.get_text().strip()
            if title:
                bookmark.title = title[:255]
                bookmark.save()

    # Readability extraction: convert HTML to plain text, keeping link
    # targets but dropping images.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)

    extraction, _ = Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            },
        },
    )

    # Kick off AI summary generation asynchronously.
    generate_summary.delay(bookmark_id)
    return f"Processed bookmark {bookmark_id}"


@shared_task
def generate_summary(bookmark_id):
    """Generate (or refresh) the AI summary for a bookmark's extraction.

    Always writes *some* ``Summary`` row — a real AI summary when possible,
    otherwise an explanatory fallback — so the UI never waits forever.

    Args:
        bookmark_id: Primary key of the ``Bookmark`` to summarize.

    Returns:
        A status string, or ``None`` when prerequisites are missing.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
        extraction = bookmark.extraction
    except Bookmark.DoesNotExist:
        return
    except Extraction.DoesNotExist:
        # If extraction doesn't exist yet, we might want to wait or just return.
        # But in EAGER mode it should be there.
        return

    content_to_summarize = extraction.content_text.strip()
    used_backup = extraction.metadata.get('used_backup', False)

    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Prepare prompt for AI. Cap content at 4000 chars to stay within the
    # local model's context window.
    if used_backup:
        prompt = (
            f"The specific page '{bookmark.url}' could not be reached. "
            f"Summarize the main domain front page content instead to describe what this website is about."
            f"\n\nContent:\n{content_to_summarize[:4000]}"
        )
    else:
        prompt = (
            f"Summarize the following content from the webpage '{bookmark.title or bookmark.url}' "
            f"in 2-3 concise sentences. Focus on the main points for a researcher."
            f"\n\nContent:\n{content_to_summarize[:4000]}"
        )

    try:
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": "You are a helpful assistant that summarizes web content for researchers and knowledge workers. Be concise and professional."},
                {"role": "user", "content": prompt},
            ],
        })

        summary_text = None
        if response.get("success"):
            summary_text = LocalAIApi.extract_text(response)

        if summary_text and len(summary_text.strip()) > 10:
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': summary_text.strip()}
            )
            return f"Generated summary for bookmark {bookmark_id}"

        error_msg = response.get('error') or "Empty response from AI"
        logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")

        # Create a fallback summary to stop the spinner.
        fallback_content = "AI summary could not be generated at this time. "
        if used_backup:
            fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
        elif bookmark.title:
            fallback_content += f"The page appears to be titled '{bookmark.title}'."
        else:
            fallback_content += f"Please visit the link directly: {bookmark.url}"

        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': fallback_content}
        )
        return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"