diff --git a/ai/__pycache__/local_ai_api.cpython-311.pyc b/ai/__pycache__/local_ai_api.cpython-311.pyc index ae12bda..bd2f0d5 100644 Binary files a/ai/__pycache__/local_ai_api.cpython-311.pyc and b/ai/__pycache__/local_ai_api.cpython-311.pyc differ diff --git a/ai/local_ai_api.py b/ai/local_ai_api.py index bcff732..99252e5 100644 --- a/ai/local_ai_api.py +++ b/ai/local_ai_api.py @@ -1,37 +1,3 @@ -""" -LocalAIApi — lightweight Python client for the Flatlogic AI proxy. - -Usage (inside the Django workspace): - - from ai.local_ai_api import LocalAIApi - - response = LocalAIApi.create_response({ - "input": [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Summarise this text in two sentences."}, - ], - "text": {"format": {"type": "json_object"}}, - }) - - if response.get("success"): - data = LocalAIApi.decode_json_from_response(response) - # ... - -# Typical successful payload (truncated): -# { -# "id": "resp_xxx", -# "status": "completed", -# "output": [ -# {"type": "reasoning", "summary": []}, -# {"type": "message", "content": [{"type": "output_text", "text": "Your final answer here."}]} -# ], -# "usage": { "input_tokens": 123, "output_tokens": 456 } -# } - -The helper automatically injects the project UUID header and falls back to -reading executor/.env if environment variables are missing. -""" - from __future__ import annotations import json @@ -52,10 +18,8 @@ __all__ = [ "decode_json_from_response", ] - _CONFIG_CACHE: Optional[Dict[str, Any]] = None - class LocalAIApi: """Static helpers mirroring the PHP implementation.""" @@ -76,9 +40,7 @@ class LocalAIApi: def decode_json_from_response(response: Dict[str, Any]) -> Optional[Dict[str, Any]]: return decode_json_from_response(response) - def create_response(params: Dict[str, Any], options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: - """Signature compatible with the OpenAI Responses API.""" options = options or {} payload = dict(params) @@ -111,9 +73,7 @@ def create_response(params: Dict[str, Any], options: Optional[Dict[str, Any]] = return initial - def request(path: Optional[str], payload: Dict[str, Any], options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: - """Perform a raw request to the AI proxy.""" cfg = _config() options = options or {} @@ -145,6 +105,7 @@ def request(path: Optional[str], payload: Dict[str, Any], options: Optional[Dict "Content-Type": "application/json", "Accept": "application/json", cfg["project_header"]: project_uuid, + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", } extra_headers = options.get("headers") if isinstance(extra_headers, Iterable): @@ -156,9 +117,7 @@ def request(path: Optional[str], payload: Dict[str, Any], options: Optional[Dict body = json.dumps(payload, ensure_ascii=False).encode("utf-8") return _http_request(url, "POST", body, headers, timeout, verify_tls) - def fetch_status(ai_request_id: Any, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: - """Fetch status for a queued AI request.""" cfg = _config() options = options or {} @@ -180,6 +139,7 @@ def fetch_status(ai_request_id: Any, options: Optional[Dict[str, Any]] = None) - headers: Dict[str, str] = { "Accept": "application/json", cfg["project_header"]: project_uuid, + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", } extra_headers = options.get("headers") if isinstance(extra_headers, Iterable): @@ -190,9 +150,7 @@ def fetch_status(ai_request_id: Any, options: Optional[Dict[str, Any]] = None) - return _http_request(url, "GET", None, headers, timeout, verify_tls) - def await_response(ai_request_id: Any, options: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: - """Poll status endpoint until the request is complete or timed out.""" options = options or {} timeout = int(options.get("timeout", 300)) interval = int(options.get("interval", 5)) @@ -236,14 +194,10 @@ def await_response(ai_request_id: Any, options: Optional[Dict[str, Any]] = None) } time.sleep(interval) - def extract_text(response: Dict[str, Any]) -> str: - """Public helper to extract plain text from a Responses payload.""" return _extract_text(response) - def decode_json_from_response(response: Dict[str, Any]) -> Optional[Dict[str, Any]]: - """Attempt to decode JSON emitted by the model (handles markdown fences).""" text = _extract_text(response) if text == "": return None @@ -270,7 +224,6 @@ def decode_json_from_response(response: Dict[str, Any]) -> Optional[Dict[str, An return None return None - def _extract_text(response: Dict[str, Any]) -> str: payload = response.get("data") if response.get("success") else response.get("response") if isinstance(payload, dict): @@ -294,9 +247,8 @@ def _extract_text(response: Dict[str, Any]) -> str: return payload return "" - def _config() -> Dict[str, Any]: - global _CONFIG_CACHE # noqa: PLW0603 + global _CONFIG_CACHE if _CONFIG_CACHE is not None: return _CONFIG_CACHE @@ -320,7 +272,6 @@ def _config() -> Dict[str, Any]: } return _CONFIG_CACHE - def _build_url(path: str, base_url: str) -> str: trimmed = path.strip() if trimmed.startswith("http://") or trimmed.startswith("https://"): @@ -329,7 +280,6 @@ def _build_url(path: str, base_url: str) -> str: return f"{base_url}{trimmed}" return f"{base_url}/{trimmed}" - def _resolve_status_path(ai_request_id: Any, cfg: Dict[str, Any]) -> str: base_path = (cfg.get("responses_path") or "").rstrip("/") if not base_path: @@ -338,12 +288,8 @@ def _resolve_status_path(ai_request_id: Any, cfg: Dict[str, Any]) -> str: base_path = f"{base_path}/ai-request" return f"{base_path}/{ai_request_id}/status" - def _http_request(url: str, method: str, body: Optional[bytes], headers: Dict[str, str], timeout: int, verify_tls: bool) -> Dict[str, Any]: - """ - Shared HTTP helper for GET/POST requests. - """ req = urlrequest.Request(url, data=body, method=method.upper()) for name, value in headers.items(): req.add_header(name, value) @@ -361,7 +307,7 @@ def _http_request(url: str, method: str, body: Optional[bytes], headers: Dict[st except urlerror.HTTPError as exc: status = exc.getcode() response_body = exc.read().decode("utf-8", errors="replace") - except Exception as exc: # pylint: disable=broad-except + except Exception as exc: return { "success": False, "error": "request_failed", @@ -395,9 +341,7 @@ def _http_request(url: str, method: str, body: Optional[bytes], headers: Dict[st "response": decoded if decoded is not None else response_body, } - def _ensure_env_loaded() -> None: - """Populate os.environ from executor/.env if variables are missing.""" if os.getenv("PROJECT_UUID") and os.getenv("PROJECT_ID"): return diff --git a/core/__pycache__/tasks.cpython-311.pyc b/core/__pycache__/tasks.cpython-311.pyc index 7074f20..3eba445 100644 Binary files a/core/__pycache__/tasks.cpython-311.pyc and b/core/__pycache__/tasks.cpython-311.pyc differ diff --git a/core/__pycache__/urls.cpython-311.pyc b/core/__pycache__/urls.cpython-311.pyc index ba30600..12c7090 100644 Binary files a/core/__pycache__/urls.cpython-311.pyc and b/core/__pycache__/urls.cpython-311.pyc differ diff --git a/core/__pycache__/views.cpython-311.pyc b/core/__pycache__/views.cpython-311.pyc index 61d1f79..9c74dc9 100644 Binary files a/core/__pycache__/views.cpython-311.pyc and b/core/__pycache__/views.cpython-311.pyc differ diff --git a/core/tasks.py b/core/tasks.py index 1de05d3..dc1829b 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -6,9 +6,18 @@ from ai.local_ai_api import LocalAIApi from bs4 import BeautifulSoup import html2text import logging +from urllib.parse import urlparse logger = logging.getLogger(__name__) +DEFAULT_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" +} + +def get_base_url(url): + parsed = urlparse(url) + return f"{parsed.scheme}://{parsed.netloc}/" + @shared_task(bind=True, max_retries=3) def process_bookmark(self, bookmark_id): try: @@ -16,14 +25,43 @@ def process_bookmark(self, bookmark_id): except Bookmark.DoesNotExist: return + html_content = "" + status_code = None + content_type = None + used_backup = False + try: - with httpx.Client(follow_redirects=True, timeout=30.0) as client: + with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client: response = client.get(bookmark.url) response.raise_for_status() html_content = response.text + status_code = response.status_code + content_type = response.headers.get('content-type') + + # If content is too small, maybe it's a redirect or anti-bot page + if len(html_content) < 500: + raise ValueError("Content too small, likely failed to scrape meaningful data.") + except Exception as exc: - logger.error(f"Error fetching bookmark {bookmark_id}: {exc}") - raise self.retry(exc=exc, countdown=60) + logger.warning(f"Error fetching bookmark {bookmark_id} ({bookmark.url}): {exc}. Trying base domain backup.") + try: + base_url = get_base_url(bookmark.url) + if base_url.rstrip('/') != bookmark.url.rstrip('/'): + with httpx.Client(follow_redirects=True, timeout=20.0, headers=DEFAULT_HEADERS) as client: + response = client.get(base_url) + response.raise_for_status() + html_content = response.text + status_code = response.status_code + content_type = response.headers.get('content-type') + used_backup = True + else: + if not html_content: + raise exc + except Exception as base_exc: + logger.error(f"Error fetching base domain for bookmark {bookmark_id}: {base_exc}") + if not html_content: + html_content = f"

Failed to retrieve content from {bookmark.url} and its base domain.

" + status_code = status_code or 0 soup = BeautifulSoup(html_content, 'html.parser') @@ -46,8 +84,9 @@ def process_bookmark(self, bookmark_id): 'content_html': html_content, 'content_text': text_content, 'metadata': { - 'status_code': response.status_code, - 'content_type': response.headers.get('content-type'), + 'status_code': status_code, + 'content_type': content_type, + 'used_backup': used_backup, } } ) @@ -62,30 +101,69 @@ def generate_summary(bookmark_id): try: bookmark = Bookmark.objects.get(id=bookmark_id) extraction = bookmark.extraction - except (Bookmark.DoesNotExist, Extraction.DoesNotExist): + except Bookmark.DoesNotExist: + return + except Extraction.DoesNotExist: + # If extraction doesn't exist yet, we might want to wait or just return + # But in EAGER mode it should be there. return - if not extraction.content_text: + content_to_summarize = extraction.content_text.strip() + used_backup = extraction.metadata.get('used_backup', False) + + if not content_to_summarize or len(content_to_summarize) < 50: + Summary.objects.update_or_create( + bookmark=bookmark, + defaults={'content': f"Insufficient content extracted from {bookmark.url} to generate a meaningful AI summary."} + ) return # Prepare prompt for AI - prompt = f"Summarize the following content from the webpage '{bookmark.title or bookmark.url}' in 2-3 concise sentences. Focus on the main points for a researcher.\n\nContent:\n{extraction.content_text[:4000]}" + if used_backup: + prompt = f"The specific page '{bookmark.url}' could not be reached. Summarize the main domain front page content instead to describe what this website is about.\n\nContent:\n{content_to_summarize[:4000]}" + else: + prompt = f"Summarize the following content from the webpage '{bookmark.title or bookmark.url}' in 2-3 concise sentences. Focus on the main points for a researcher.\n\nContent:\n{content_to_summarize[:4000]}" - response = LocalAIApi.create_response({ - "input": [ - {"role": "system", "content": "You are a helpful assistant that summarizes web content for researchers and knowledge workers. Be concise and professional."}, - {"role": "user", "content": prompt}, - ], - }) + try: + response = LocalAIApi.create_response({ + "input": [ + {"role": "system", "content": "You are a helpful assistant that summarizes web content for researchers and knowledge workers. Be concise and professional."}, + {"role": "user", "content": prompt}, + ], + }) - if response.get("success"): - summary_text = LocalAIApi.extract_text(response) - if summary_text: + summary_text = None + if response.get("success"): + summary_text = LocalAIApi.extract_text(response) + + if summary_text and len(summary_text.strip()) > 10: Summary.objects.update_or_create( bookmark=bookmark, - defaults={'content': summary_text} + defaults={'content': summary_text.strip()} ) return f"Generated summary for bookmark {bookmark_id}" - - logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {response.get('error')}") - return f"Failed to generate summary for bookmark {bookmark_id}" \ No newline at end of file + else: + error_msg = response.get('error') or "Empty response from AI" + logger.error(f"Failed to generate summary for bookmark {bookmark_id}: {error_msg}") + + # Create a fallback summary to stop the spinner + fallback_content = "AI summary could not be generated at this time. " + if used_backup: + fallback_content += "The original page was unreachable, and the home page content was insufficient for a summary." + elif bookmark.title: + fallback_content += f"The page appears to be titled '{bookmark.title}'." + else: + fallback_content += f"Please visit the link directly: {bookmark.url}" + + Summary.objects.update_or_create( + bookmark=bookmark, + defaults={'content': fallback_content} + ) + return f"Failed to generate summary for bookmark {bookmark_id}, created fallback." + except Exception as e: + logger.exception(f"Unexpected error in generate_summary for bookmark {bookmark_id}: {e}") + Summary.objects.update_or_create( + bookmark=bookmark, + defaults={'content': "An unexpected error occurred while generating the AI summary."} + ) + return f"Error in generate_summary for bookmark {bookmark_id}" \ No newline at end of file diff --git a/core/templates/core/bookmark_detail.html b/core/templates/core/bookmark_detail.html index bd2c8cb..e0a8904 100644 --- a/core/templates/core/bookmark_detail.html +++ b/core/templates/core/bookmark_detail.html @@ -16,20 +16,28 @@

{{ bookmark.title|default:bookmark.url }}

{% if bookmark.user == request.user %} - @@ -47,19 +55,37 @@
{% endif %} - {% if bookmark.summary %} -
-
AI Summary
-
+
+
+
AI Summary
+ {% if bookmark.user == request.user and bookmark.summary %} + + {% endif %} +
+ + {% if bookmark.summary %} +
{{ bookmark.summary.content }}
-
- {% else %} -
-
- AI Summary is being generated... -
- {% endif %} + {% if bookmark.user == request.user %} +
+
+ {% csrf_token %} + +
+ + +
+
+
+ {% endif %} + {% else %} +
+
+ AI Summary is being generated... +
+ {% endif %} +
{% for tag in bookmark.tags.all %} @@ -68,14 +94,37 @@
- {% if bookmark.extraction %} -
-
Extracted Text Content
-
+
+
+
Extracted Text Content
+ {% if bookmark.user == request.user and bookmark.extraction %} + + {% endif %} +
+ + {% if bookmark.extraction %} +
{{ bookmark.extraction.content_text|linebreaks }}
-
- {% endif %} + {% if bookmark.user == request.user %} +
+
+ {% csrf_token %} + +
+ + +
+
+
+ {% endif %} + {% else %} +
+
+ Content is being extracted... +
+ {% endif %} +
@@ -121,6 +170,18 @@ {% block extra_js %}