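"""Celery tasks for the bookmark pipeline.

``process_bookmark`` fetches a bookmarked page (falling back to the site's base
domain when the exact URL fails), stores the extracted HTML and text, and then
chains ``generate_summary``, which asks the local AI API for a short summary
and, when the bookmark is untagged, a handful of suggested tags.
"""
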
import logging
from urllib.parse import urlparse

import html2text
import httpx
from bs4 import BeautifulSoup
from celery import shared_task
from taggit.models import Tag

from ai.local_ai_api import LocalAIApi
from core.models import Bookmark, Extraction, Summary

logger = logging.getLogger(__name__)


# Browser-like headers: some sites reject requests that advertise an obvious bot User-Agent.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def get_base_url(url):
    """Return the scheme and host of ``url``, e.g. ``https://example.com/``."""
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}/"
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch the bookmarked URL, store the extraction, and queue the AI summary."""
    # bind=True exposes `self` so `self.retry()` could be used; the current flow
    # prefers the base-domain fallback below over retrying.
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False
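
    # Fetch strategy: try the exact URL first; on any failure (network error, bad
    # status, or a suspiciously tiny page) fall back to the site's base domain so
    # at least some context is stored. As a last resort a placeholder HTML page is
    # recorded, so downstream steps never see an empty extraction.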
    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')

            # If the content is very small, it is probably a redirect stub or an anti-bot page.
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")

    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            elif not html_content:
                # The URL already is the base domain and nothing usable was fetched.
                raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")
            if not html_content:
                html_content = f"<html><body><p>Failed to retrieve content from {bookmark.url} and its base domain.</p></body></html>"
                status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')

    # Fill in the title from the page's <title> tag if the user did not set one.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if title_text:
                bookmark.title = title_text[:255]
                bookmark.save()

    # Convert the HTML to markdown-ish plain text for storage and summarization.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)
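
    # Persist both the raw HTML and the converted text; generate_summary reads
    # content_text and the request metadata stored here (e.g. used_backup).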
    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            },
        },
    )

    # AI summary generation
    generate_summary.delay(bookmark_id)

    return f"Processed bookmark {bookmark_id}"
@shared_task
def generate_summary(bookmark_id):
    """Ask the local AI API for a summary (and optionally tags) of the extraction."""
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        return

    try:
        extraction = bookmark.extraction
    except Extraction.DoesNotExist:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "Content extraction failed or is still in progress. AI summary cannot be generated."}
        )
        return

    # Be defensive about nullable fields on the extraction.
    content_to_summarize = (extraction.content_text or "").strip()
    used_backup = (extraction.metadata or {}).get('used_backup', False)

    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Only generate tags when the bookmark has none; offer existing tags for reuse.
    should_generate_tags = bookmark.tags.count() == 0
    existing_tags = list(Tag.objects.values_list('name', flat=True).distinct()[:50])
    existing_tags_str = ", ".join(existing_tags)

    # Build the prompt for the AI.
    system_prompt = (
        "You are a helpful assistant that summarizes web content and suggests tags for researchers. "
        "Be concise and professional. Always return the response in JSON format."
    )

    user_prompt = f"Analyze the following content from the webpage '{bookmark.title or bookmark.url}'.\n\n"
    user_prompt += "1. Provide a summary in 2-3 concise sentences.\n"

    if should_generate_tags:
        user_prompt += "2. Suggest 3-5 short and concise tags for this content.\n"
        if existing_tags:
            user_prompt += f"Prioritize these existing tags if they match: {existing_tags_str}\n"

    user_prompt += "\nReturn your response in valid JSON format:\n"
    user_prompt += "{\n  \"summary\": \"your summary here\""
    if should_generate_tags:
        user_prompt += ",\n  \"tags\": [\"tag1\", \"tag2\", \"tag3\"]\n"
    else:
        user_prompt += "\n"
    user_prompt += "}\n\n"
    user_prompt += f"Content:\n{content_to_summarize[:4000]}"
    try:
        logger.info(f"Generating summary/tags for bookmark {bookmark_id}...")
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            # "response_format": {"type": "json_object"}  # Some proxies might not like this
        })
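
        # As used here, LocalAIApi.create_response is assumed to return a dict with a
        # "success" flag (and "error" on failure), with extract_text() and
        # decode_json_from_response() as helpers for pulling text/JSON out of it.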
        summary_text = None
        suggested_tags = []

        if response.get("success"):
            raw_text = LocalAIApi.extract_text(response)
            logger.info(f"AI raw response for {bookmark_id}: {raw_text}")
            data = LocalAIApi.decode_json_from_response(response)
            if data:
                summary_text = data.get("summary")
                suggested_tags = data.get("tags", [])
                logger.info(f"Decoded JSON for {bookmark_id}: summary={bool(summary_text)}, tags={suggested_tags}")
            else:
                logger.warning(f"JSON decoding failed for {bookmark_id}. Falling back to raw text.")
                summary_text = raw_text

            if summary_text and len(summary_text.strip()) > 10:
                Summary.objects.update_or_create(
                    bookmark=bookmark,
                    defaults={'content': summary_text.strip()}
                )

                # Attach suggested tags, but only if the bookmark started without any.
                if should_generate_tags and suggested_tags:
                    # Limit to 5 tags and make sure they are plain strings.
                    valid_tags = [str(t)[:50] for t in suggested_tags if t][:5]
                    if valid_tags:
                        bookmark.tags.add(*valid_tags)
                        logger.info(f"Successfully added tags {valid_tags} to bookmark {bookmark_id}")
                        return f"Generated summary and tags for bookmark {bookmark_id}"

                return f"Generated summary for bookmark {bookmark_id}"
        else:
            error_msg = response.get('error') or "Empty response from AI"
            logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")

            # Create a fallback summary to stop the spinner in the UI.
            fallback_content = "AI summary could not be generated at this time. "
            if used_backup:
                fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
            elif bookmark.title:
                fallback_content += f"The page appears to be titled '{bookmark.title}'."
            else:
                fallback_content += f"Please visit the link directly: {bookmark.url}"

            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': fallback_content}
            )
            return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        logger.error(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"