38191-vm/core/tasks.py
2026-02-07 02:35:09 +00:00

169 lines
6.8 KiB
Python

import httpx
from celery import shared_task
from django.utils import timezone
from core.models import Bookmark, Extraction, Summary
from ai.local_ai_api import LocalAIApi
from bs4 import BeautifulSoup
import html2text
import logging
from urllib.parse import urlparse
logger = logging.getLogger(__name__)
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def get_base_url(url):
    """Return the scheme+host root of *url*, e.g. 'https://x.com/a/b' -> 'https://x.com/'.

    The netloc keeps any explicit port; path, query, and fragment are dropped.
    """
    parts = urlparse(url)
    return "{0}://{1}/".format(parts.scheme, parts.netloc)
@shared_task(bind=True, max_retries=3)
def process_bookmark(self, bookmark_id):
    """Fetch a bookmark's URL, extract its title and text, store an Extraction, and queue AI summarization.

    Falls back to fetching the site's base domain when the exact page fails or
    returns suspiciously little content (< 500 chars, likely an anti-bot or
    redirect stub). If both fetches fail, a placeholder HTML document is stored
    so the extraction/summary pipeline always has something to work with.

    Args:
        bookmark_id: Primary key of the Bookmark to process. Silently returns
            if the row no longer exists (deleted before the task ran).

    Returns:
        A short status string, or None when the bookmark is gone.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
    except Bookmark.DoesNotExist:
        # Bookmark was deleted before the worker picked this up; nothing to do.
        return

    html_content = ""
    status_code = None
    content_type = None
    used_backup = False

    try:
        with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
            response = client.get(bookmark.url)
            response.raise_for_status()
            html_content = response.text
            status_code = response.status_code
            content_type = response.headers.get('content-type')
            # If content is too small, maybe it's a redirect or anti-bot page
            if len(html_content) < 500:
                raise ValueError("Content too small, likely failed to scrape meaningful data.")
    except Exception as exc:
        logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.")
        try:
            base_url = get_base_url(bookmark.url)
            if base_url.rstrip('/') != bookmark.url.rstrip('/'):
                with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client:
                    response = client.get(base_url)
                    response.raise_for_status()
                    html_content = response.text
                    status_code = response.status_code
                    content_type = response.headers.get('content-type')
                    used_backup = True
            else:
                # Bookmark already points at the base domain: nothing else to
                # try. Re-raise unless the first attempt produced *some*
                # (undersized) content we can still keep.
                if not html_content:
                    raise exc
        except Exception as base_exc:
            logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}")
            if not html_content:
                # Store a placeholder so downstream extraction/summary steps
                # have well-formed HTML to operate on.
                html_content = f"<html><body><p>Failed to retrieve content from {bookmark.url} and its base domain.</p></body></html>"
                status_code = status_code or 0

    soup = BeautifulSoup(html_content, 'html.parser')
    # Fill in the title from the page's <title> tag if the user didn't set one.
    if not bookmark.title:
        title_tag = soup.find('title')
        if title_tag:
            # BUGFIX: title_tag.string is None when <title> contains nested
            # markup, which made .strip() raise AttributeError. get_text()
            # handles both the plain-text and nested cases safely.
            title_text = title_tag.get_text(strip=True)
            if title_text:
                bookmark.title = title_text[:255]
                bookmark.save()

    # Convert HTML to markdown-ish plain text for storage and summarization.
    h = html2text.HTML2Text()
    h.ignore_links = False
    h.ignore_images = True
    text_content = h.handle(html_content)

    # Upsert keyed on the bookmark so re-processing refreshes the extraction.
    Extraction.objects.update_or_create(
        bookmark=bookmark,
        defaults={
            'content_html': html_content,
            'content_text': text_content,
            'metadata': {
                'status_code': status_code,
                'content_type': content_type,
                'used_backup': used_backup,
            }
        }
    )
    # AI summary runs as its own task so a slow model doesn't block scraping.
    generate_summary.delay(bookmark_id)
    return f"Processed bookmark {bookmark_id}"
@shared_task
def generate_summary(bookmark_id):
    """Generate an AI summary for a bookmark's extracted text, with fallbacks.

    Always writes a Summary row — the AI output, an "insufficient content"
    notice, or an explanatory fallback — so the UI never waits on a summary
    that will not arrive.

    Args:
        bookmark_id: Primary key of the Bookmark to summarize. Silently
            returns if the bookmark or its extraction does not exist.

    Returns:
        A short status string, or None when there is nothing to summarize.
    """
    try:
        bookmark = Bookmark.objects.get(id=bookmark_id)
        extraction = bookmark.extraction
    except Bookmark.DoesNotExist:
        return
    except Extraction.DoesNotExist:
        # If extraction doesn't exist yet, we might want to wait or just return
        # But in EAGER mode it should be there.
        return

    # ROBUSTNESS: content_text / metadata may be NULL in the database; guard
    # before calling .strip() / .get() to avoid AttributeError.
    content_to_summarize = (extraction.content_text or "").strip()
    metadata = extraction.metadata or {}
    used_backup = metadata.get('used_backup', False)

    if not content_to_summarize or len(content_to_summarize) < 50:
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."}
        )
        return

    # Prepare prompt for AI. When only the base domain was reachable, ask the
    # model to describe the site rather than the (unreachable) specific page.
    if used_backup:
        prompt = f"The specific page '{bookmark.url}' could not be reached. Summarize the main domain front page content instead to describe what this website is about.\n\nContent:\n{content_to_summarize[:4000]}"
    else:
        prompt = f"Summarize the following content from the webpage '{bookmark.title or bookmark.url}' in 2-3 concise sentences. Focus on the main points for a researcher.\n\nContent:\n{content_to_summarize[:4000]}"

    try:
        response = LocalAIApi.create_response({
            "input": [
                {"role": "system", "content": "You are a helpful assistant that summarizes web content for researchers and knowledge workers. Be concise and professional."},
                {"role": "user", "content": prompt},
            ],
        })
        summary_text = None
        if response.get("success"):
            summary_text = LocalAIApi.extract_text(response)
        # Require a minimally substantial answer before accepting it.
        if summary_text and len(summary_text.strip()) > 10:
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': summary_text.strip()}
            )
            return f"Generated summary for bookmark {bookmark_id}"
        else:
            error_msg = response.get('error') or "Empty response from AI"
            logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}")
            # Create a fallback summary to stop the spinner
            fallback_content = "AI summary could not be generated at this time. "
            if used_backup:
                fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary."
            elif bookmark.title:
                fallback_content += f"The page appears to be titled '{bookmark.title}'."
            else:
                fallback_content += f"Please visit the link directly: {bookmark.url}"
            Summary.objects.update_or_create(
                bookmark=bookmark,
                defaults={'content': fallback_content}
            )
            return f"Failed to generate summary for bookmark {bookmark_id}, created fallback."
    except Exception as e:
        # Last-resort catch: a crash here would leave the bookmark with no
        # summary row at all, so log and store a generic error message.
        logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}")
        Summary.objects.update_or_create(
            bookmark=bookmark,
            defaults={'content': "An unexpected error occurred while generating the AI summary."}
        )
        return f"Error in generate_summary for bookmark {bookmark_id}"