Autosave: 20260322-235115

2026-03-22 23:51:15 +00:00 · 2026-03-22 23:51:15 +00:00 · 69b734c063
commit 69b734c063
parent 3180c25595
2 changed files with 55 additions and 48 deletions
--- a/core/services/pycache/resolution.cpython-311.pyc
+++ b/core/services/pycache/resolution.cpython-311.pyc
--- a/core/services/resolution.py
+++ b/core/services/resolution.py
@ -2,7 +2,7 @@ import requests
 import logging
 from bs4 import BeautifulSoup
 from core.models import Entity, Relationship, Source
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, quote
 logger = logging.getLogger(__name__)
@ -10,34 +10,39 @@ class WebCrawler:
    """
    Crawler to extract information from the web without relying on APIs.
    """
-    def __init__(self, start_url):
+    def __init__(self):
        self.start_url = start_url
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        })
-    def crawl(self, query):
+    def search(self, query):
        """
-        Main entry point for web crawling.
+        Perform a simulated search on Google using requests.
        """
-        logger.info(f"Starting crawl for: {query}")
+        search_url = f"https://www.google.com/search?q={quote(query)}"
-        # 1. Perform search queries on Google/Bing or specialized sites
+        logger.info(f"Crawling URL: {search_url}")
-        # 2. Extract links and parse content
+        
-        # 3. Identify new entities and relationships
+        try:
-        return self._simulate_discovery(query)
+            response = self.session.get(search_url, timeout=10)
-
+            response.raise_for_status()
-    def _simulate_discovery(self, query):
+            soup = BeautifulSoup(response.text, "html.parser")
-        # This will be replaced by actual logic using BeautifulSoup/requests
+            
-        return {
+            # Simple extraction of titles/links from Google search results
-            "entities": [
+            results = []
-                {"type": "PERSON", "value": query, "identifier": "WEB-ID-1"},
+            # Selector for Google search result titles
-                {"type": "PERSON", "value": "Associate of " + query, "identifier": "WEB-ID-2"},
+            for g in soup.select("div.g"):
-            ],
+                title_elem = g.select_one("h3")
-            "relationships": [
+                link_elem = g.select_one("a")
-                {"source": query, "target": "Associate of " + query, "type": "ASSOCIATED_WITH"}
+                if title_elem and link_elem:
-            ]
+                    results.append({
-        }
+                        "title": title_elem.get_text(),
                        "url": link_elem.get("href")
                    })
            return results
        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []
 class NetworkDiscoveryService:
    @staticmethod
@ -46,38 +51,40 @@ class NetworkDiscoveryService:
        Performs discovery using Web Crawling.
        """
        try:
-            crawler = WebCrawler(start_url="https://www.google.com")
+            crawler = WebCrawler()
-            data = crawler.crawl(query)
+            search_results = crawler.search(query)
            source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
-            person = None
+            # 1. Create main entity
-            for ent_data in data.get("entities", []):
+            person, _ = Entity.objects.get_or_create(
-                entity, _ = Entity.objects.get_or_create(
+                entity_type='PERSON',
-                    entity_type=ent_data['type'], 
+                value=query,
-                    value=ent_data['value'], 
+                source=source
-                    source=source
+            )
-                )
+            person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
-                entity.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={ent_data['value'].replace(' ', '+')}"
+            person.save()
                entity.identifier_code = ent_data.get('identifier', 'UNKNOWN')
                entity.save()
                if ent_data['type'] == 'PERSON':
                    person = entity
-            for rel_data in data.get("relationships", []):
+            # 2. Extract potential associates from titles
-                s_entity = Entity.objects.filter(value=rel_data['source']).first()
+            for res in search_results:
-                t_entity = Entity.objects.filter(value=rel_data['target']).first()
+                # Naive associate detection
-                
+                associate_val = res['title'][:50]
-                if s_entity and t_entity:
+                if associate_val != query:
                    associate, _ = Entity.objects.get_or_create(
                        entity_type='PERSON',
                        value=associate_val,
                        source=source
                    )
                    # 3. Create relationship
                    Relationship.objects.get_or_create(
-                        source_entity=s_entity,
+                        source_entity=person,
-                        target_entity=t_entity,
+                        target_entity=associate,
-                        relationship_type=rel_data['type'],
+                        relationship_type='ASSOCIATED_WITH',
-                        weight=0.9
+                        weight=0.5
                    )
-            return person or Entity.objects.filter(value=query).first()
+            return person
        except Exception as e:
            logger.error(f"Error performing web-based discovery for {query}: {e}")
@ -87,4 +94,4 @@ class EntityResolutionService:
    @staticmethod
    def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
        # Implementation remains unchanged
-        return True
+        return True