"""Celery tasks that fetch a bookmark's page, store its extraction and request an AI summary."""

import json
import logging
from urllib.parse import urlparse

import html2text
import httpx
from bs4 import BeautifulSoup
from celery import shared_task
from django.utils import timezone
from taggit.models import Tag

from ai.local_ai_api import LocalAIApi
from core.models import Bookmark, Extraction, Summary

logger = logging.getLogger(__name__)

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Responses shorter than this are treated as a failed scrape (redirect stub or anti-bot page).
_MIN_PAGE_LENGTH = 500
# Extracted text shorter than this is not worth sending to the AI.
_MIN_SUMMARY_INPUT_LENGTH = 50
# Only the first part of the extracted text is embedded in the prompt.
_MAX_PROMPT_CONTENT_LENGTH = 4000
_FETCH_TIMEOUT_SECONDS = 20.0
_MAX_TAGS = 5
_MAX_TAG_LENGTH = 50
_MAX_TITLE_LENGTH = 255


def get_base_url(url: str) -> str:
    """Return the ``scheme://netloc/`` root of *url*, dropping path, query and fragment."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"


def _fetch_page(url):
    """GET *url* and return ``(text, status_code, content_type)``.

    Redirects are followed. Raises whatever httpx raises on transport errors,
    and ``httpx.HTTPStatusError`` on 4xx/5xx responses.
    """
    with httpx.Client(
        follow_redirects=True,
        timeout=_FETCH_TIMEOUT_SECONDS,
        headers=DEFAULT_HEADERS,
    ) as client:
        response = client.get(url)
        response.raise_for_status()
        return response.text, response.status_code, response.headers.get('content-type')


def _build_summary_prompt(page_label, content, existing_tags, include_tags):
    """Assemble the user prompt asking for a JSON summary (and tags when *include_tags*)."""
    parts = [
        f"Analyze the following content from the webpage '{page_label}'.\n\n",
        "1. Provide a summary in 2-3 concise sentences.\n",
    ]
    if include_tags:
        parts.append("2. Suggest 3-5 short and concise tags for this content.\n")
        if existing_tags:
            existing_tags_str = ", ".join(existing_tags)
            parts.append(f"Prioritize these existing tags if they match: {existing_tags_str}\n")
    parts.append("\nReturn your response in valid JSON format:\n")
    parts.append("{\n \"summary\": \"your summary here\"")
    if include_tags:
        parts.append(",\n \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n")
    else:
        parts.append("\n")
    parts.append("}\n\n")
    parts.append(f"Content:\n{content[:_MAX_PROMPT_CONTENT_LENGTH]}")
    return "".join(parts)


def _normalize_tags(raw_tags):
    """Turn the AI's ``tags`` value into at most five non-empty tag strings."""
    if isinstance(raw_tags, str):
        # A bare string would otherwise be iterated character by character.
        raw_tags = raw_tags.split(",")
    elif not isinstance(raw_tags, (list, tuple)):
        return []
    cleaned = (str(tag).strip()[:_MAX_TAG_LENGTH] for tag in raw_tags if tag)
    return [tag for tag in cleaned if tag][:_MAX_TAGS]


@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch the bookmarked page, store its extraction and queue summary generation.

    If the bookmark URL cannot be fetched (or returns suspiciously little
    content), the site's base URL is tried as a backup. When both attempts
    fail, a placeholder document is stored so downstream steps still have an
    ``Extraction`` row to work with.

    Args:
        self: Bound Celery task instance (not used by this body; no retry is issued here).
        bookmark_id: Primary key of the ``Bookmark`` to process.

    Returns:
        A status string, or ``None`` when the bookmark no longer exists.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False

    try:
        html_content, status_code, content_type = _fetch_page(bookmark.url)
        # If content is too small, maybe it's a redirect or anti-bot page
        if len(html_content) < _MIN_PAGE_LENGTH:
            raise ValueError("Content too small, likely failed to scrape meaningful data.")
    except Exception as exc:  # best-effort scrape: any failure triggers the backup path
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                html_content, status_code, content_type = _fetch_page(base_url)
                used_backup = True
            elif not html_content:
                # The bookmark already points at the base URL and nothing was retrieved.
                raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")
            if not html_content:
                html_content = f"\n\nFailed to retrieve content from {bookmark.url} and its base domain.\n\n"
                status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')

    # Simple title extraction if not already set
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            # ``.string`` is None for an empty <title> or one with nested markup,
            # so read the text through get_text() instead.
            page_title = (title_tag.get_text() or "").strip()[:_MAX_TITLE_LENGTH]
            if page_title:
                bookmark.title = page_title
                bookmark.save()

    # Readability extraction
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.ignore_images = True
    text_content = converter.handle(html_content)

    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            },
        },
    )

    # AI Summary generation
    generate_summary.delay(bookmark_id)
    return f"Processed bookmark {bookmark_id}"


@shared_task
def generate_summary(bookmark_id):
    """Ask the AI backend for a summary (and tags, if the bookmark has none).

    A ``Summary`` row is always written, even on failure, so the UI never
    waits indefinitely for a result.

    Args:
        bookmark_id: Primary key of the ``Bookmark`` to summarize.

    Returns:
        A status string, or ``None`` when the bookmark does not exist, has no
        extraction yet, or has too little extracted text.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    try:
        extraction = bookmark.extraction
    except Extraction.DoesNotExist:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "Content extraction failed or is still in progress. AI summary cannot be generated."}
        )
        return

    # Guard against NULL text/metadata on the stored extraction.
    content_to_summarize = (extraction.content_text or "").strip()
    used_backup = (extraction.metadata or {}).get('used_backup', False)

    if len(content_to_summarize) < _MIN_SUMMARY_INPUT_LENGTH:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Check if we should generate tags (only if bookmark has no tags)
    should_generate_tags = bookmark.tags.count() == 0
    existing_tags = []
    if should_generate_tags:
        # Existing tag names are only needed when tags are being requested.
        existing_tags = list(Tag.objects.values_list('name', flat=True).distinct()[:50])

    # Prepare prompt for AI
    system_prompt = "You are a helpful assistant that summarizes web content and suggests tags for researchers. Be concise and professional. Always return response in JSON format."
    user_prompt = _build_summary_prompt(
        bookmark.title or bookmark.url,
        content_to_summarize,
        existing_tags,
        should_generate_tags,
    )

    try:
        logger.info(f"Generating summary/tags for bookmark {bookmark_id}...")
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # "response_format": {"type": "json_object"} # Some proxies might not like this
        })

        summary_text = None
        suggested_tags = []

        if response.get("success"):
            raw_text = LocalAIApi.extract_text(response)
            logger.info(f"AI Raw Response for {bookmark_id}: {raw_text}")

            data = LocalAIApi.decode_json_from_response(response)
            if isinstance(data, dict) and data:
                summary_text = data.get("summary")
                suggested_tags = data.get("tags", [])
                logger.info(f"Decoded JSON for {bookmark_id}: summary={bool(summary_text)}, tags={suggested_tags}")
            else:
                logger.warning(f"JSON decoding failed for {bookmark_id}. Fallback to text.")
                summary_text = raw_text

        # A non-string "summary" (e.g. a nested object) cannot be stored as text.
        if not isinstance(summary_text, str):
            summary_text = None

        if summary_text and len(summary_text.strip()) > 10:
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': summary_text.strip()}
            )

            # Add tags if we should
            if should_generate_tags and suggested_tags:
                valid_tags = _normalize_tags(suggested_tags)
                if valid_tags:
                    bookmark.tags.add(*valid_tags)
                    logger.info(f"Successfully added tags {valid_tags} to bookmark {bookmark_id}")
                    return f"Generated summary and tags for bookmark {bookmark_id}"

            return f"Generated summary for bookmark {bookmark_id}"

        error_msg = response.get('error') or "Empty response from AI"
        logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")

        # Create a fallback summary to stop the spinner
        fallback_content = "AI summary could not be generated at this time. "
        if used_backup:
            fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
        elif bookmark.title:
            fallback_content += f"The page appears to be titled '{bookmark.title}'."
        else:
            fallback_content += f"Please visit the link directly: {bookmark.url}"

        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': fallback_content}
        )
        return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:  # task boundary: always leave a Summary behind
        # logger.exception keeps the traceback alongside the original message.
        logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"