Autosave commit 69b734c063 (parent 3180c25595), saved 2026-03-22 23:51:15.
Binary file not shown.
@ -2,7 +2,7 @@ import requests
|
||||
import logging
|
||||
from bs4 import BeautifulSoup
|
||||
from core.models import Entity, Relationship, Source
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.parse import urljoin, quote
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -10,34 +10,39 @@ class WebCrawler:
|
||||
"""
|
||||
Crawler to extract information from the web without relying on APIs.
|
||||
"""
|
||||
def __init__(self):
    """
    Initialise the crawler with a browser-like HTTP session.

    The old ``start_url`` constructor parameter was removed in this
    commit; the crawler now builds its own search URLs per query.
    """
    # A persistent Session reuses TCP connections across requests.
    self.session = requests.Session()
    # Spoof a desktop Chrome user agent so Google serves the normal
    # HTML results page instead of a bot-detection/consent page.
    self.session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    })
|
||||
|
||||
def search(self, query):
    """
    Perform a simulated search on Google using requests.

    Replaces the old ``crawl``/``_simulate_discovery`` pair: instead of
    returning fabricated entities, this fetches the Google results page
    and scrapes it with BeautifulSoup.

    Returns a list of ``{"title": ..., "url": ...}`` dicts, or an empty
    list on any network or parsing failure (best-effort crawler).
    """
    # quote() percent-encodes the query so spaces/specials survive the URL.
    search_url = f"https://www.google.com/search?q={quote(query)}"
    logger.info(f"Crawling URL: {search_url}")

    try:
        response = self.session.get(search_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Simple extraction of titles/links from Google search results.
        results = []
        # NOTE(review): "div.g" is Google's container for one organic
        # result — fragile, verify against current markup.
        for g in soup.select("div.g"):
            title_elem = g.select_one("h3")
            link_elem = g.select_one("a")
            # Skip ads/snippets that lack either a heading or a link.
            if title_elem and link_elem:
                results.append({
                    "title": title_elem.get_text(),
                    "url": link_elem.get("href")
                })
        return results
    except Exception as e:
        # Deliberate best-effort behaviour: log and report no results
        # rather than propagate network/HTTP errors to callers.
        logger.error(f"Search failed: {e}")
        return []
|
||||
|
||||
class NetworkDiscoveryService:
|
||||
@staticmethod
|
||||
@ -46,38 +51,40 @@ class NetworkDiscoveryService:
|
||||
Performs discovery using Web Crawling.
|
||||
"""
|
||||
try:
|
||||
crawler = WebCrawler(start_url="https://www.google.com")
|
||||
data = crawler.crawl(query)
|
||||
crawler = WebCrawler()
|
||||
search_results = crawler.search(query)
|
||||
|
||||
source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
|
||||
|
||||
person = None
|
||||
for ent_data in data.get("entities", []):
|
||||
entity, _ = Entity.objects.get_or_create(
|
||||
entity_type=ent_data['type'],
|
||||
value=ent_data['value'],
|
||||
source=source
|
||||
)
|
||||
entity.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={ent_data['value'].replace(' ', '+')}"
|
||||
entity.identifier_code = ent_data.get('identifier', 'UNKNOWN')
|
||||
entity.save()
|
||||
|
||||
if ent_data['type'] == 'PERSON':
|
||||
person = entity
|
||||
# 1. Create main entity
|
||||
person, _ = Entity.objects.get_or_create(
|
||||
entity_type='PERSON',
|
||||
value=query,
|
||||
source=source
|
||||
)
|
||||
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
|
||||
person.save()
|
||||
|
||||
for rel_data in data.get("relationships", []):
|
||||
s_entity = Entity.objects.filter(value=rel_data['source']).first()
|
||||
t_entity = Entity.objects.filter(value=rel_data['target']).first()
|
||||
|
||||
if s_entity and t_entity:
|
||||
# 2. Extract potential associates from titles
|
||||
for res in search_results:
|
||||
# Naive associate detection
|
||||
associate_val = res['title'][:50]
|
||||
if associate_val != query:
|
||||
associate, _ = Entity.objects.get_or_create(
|
||||
entity_type='PERSON',
|
||||
value=associate_val,
|
||||
source=source
|
||||
)
|
||||
|
||||
# 3. Create relationship
|
||||
Relationship.objects.get_or_create(
|
||||
source_entity=s_entity,
|
||||
target_entity=t_entity,
|
||||
relationship_type=rel_data['type'],
|
||||
weight=0.9
|
||||
source_entity=person,
|
||||
target_entity=associate,
|
||||
relationship_type='ASSOCIATED_WITH',
|
||||
weight=0.5
|
||||
)
|
||||
|
||||
return person or Entity.objects.filter(value=query).first()
|
||||
return person
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error performing web-based discovery for {query}: {e}")
|
||||
@ -87,4 +94,4 @@ class EntityResolutionService:
|
||||
@staticmethod
|
||||
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
    """
    Decide whether two identifiers refer to the same real-world entity.

    Stub: every pair is currently treated as a match, regardless of
    *probability_threshold*. Arguments are accepted but unused.
    """
    # TODO(review): replace with a real probabilistic matching
    # implementation; until then, always report a match.
    return True
|
||||
Loading…
x
Reference in New Issue
Block a user