diff --git a/core/services/__pycache__/resolution.cpython-311.pyc b/core/services/__pycache__/resolution.cpython-311.pyc index 0d33ccc..7d58b8c 100644 Binary files a/core/services/__pycache__/resolution.cpython-311.pyc and b/core/services/__pycache__/resolution.cpython-311.pyc differ diff --git a/core/services/resolution.py b/core/services/resolution.py index a1ab8ad..571c2a2 100644 --- a/core/services/resolution.py +++ b/core/services/resolution.py @@ -2,7 +2,7 @@ import requests import logging from bs4 import BeautifulSoup from core.models import Entity, Relationship, Source -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, quote logger = logging.getLogger(__name__) @@ -10,34 +10,39 @@ class WebCrawler: """ Crawler to extract information from the web without relying on APIs. """ - def __init__(self, start_url): - self.start_url = start_url + def __init__(self): self.session = requests.Session() self.session.headers.update({ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" }) - def crawl(self, query): + def search(self, query): """ - Main entry point for web crawling. + Perform a simulated search on Google using requests. """ - logger.info(f"Starting crawl for: {query}") - # 1. Perform search queries on Google/Bing or specialized sites - # 2. Extract links and parse content - # 3. 
Identify new entities and relationships - return self._simulate_discovery(query) - - def _simulate_discovery(self, query): - # This will be replaced by actual logic using BeautifulSoup/requests - return { - "entities": [ - {"type": "PERSON", "value": query, "identifier": "WEB-ID-1"}, - {"type": "PERSON", "value": "Associate of " + query, "identifier": "WEB-ID-2"}, - ], - "relationships": [ - {"source": query, "target": "Associate of " + query, "type": "ASSOCIATED_WITH"} - ] - } + search_url = f"https://www.google.com/search?q={quote(query)}" + logger.info(f"Crawling URL: {search_url}") + + try: + response = self.session.get(search_url, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + + # Simple extraction of titles/links from Google search results + results = [] + # Selector for Google search result titles + for g in soup.select("div.g"): + title_elem = g.select_one("h3") + link_elem = g.select_one("a") + if title_elem and link_elem: + results.append({ + "title": title_elem.get_text(), + "url": link_elem.get("href") + }) + return results + except Exception as e: + logger.error(f"Search failed: {e}") + return [] class NetworkDiscoveryService: @staticmethod @@ -46,38 +51,40 @@ class NetworkDiscoveryService: Performs discovery using Web Crawling. """ try: - crawler = WebCrawler(start_url="https://www.google.com") - data = crawler.crawl(query) + crawler = WebCrawler() + search_results = crawler.search(query) source, _ = Source.objects.get_or_create(name='Web Crawler Engine') - person = None - for ent_data in data.get("entities", []): - entity, _ = Entity.objects.get_or_create( - entity_type=ent_data['type'], - value=ent_data['value'], - source=source - ) - entity.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={ent_data['value'].replace(' ', '+')}" - entity.identifier_code = ent_data.get('identifier', 'UNKNOWN') - entity.save() - - if ent_data['type'] == 'PERSON': - person = entity + # 1. 
Create main entity + person, _ = Entity.objects.get_or_create( + entity_type='PERSON', + value=query, + source=source + ) + person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}" + person.save() - for rel_data in data.get("relationships", []): - s_entity = Entity.objects.filter(value=rel_data['source']).first() - t_entity = Entity.objects.filter(value=rel_data['target']).first() - - if s_entity and t_entity: + # 2. Extract potential associates from titles + for res in search_results: + # Naive associate detection + associate_val = res['title'][:50] + if associate_val != query: + associate, _ = Entity.objects.get_or_create( + entity_type='PERSON', + value=associate_val, + source=source + ) + + # 3. Create relationship Relationship.objects.get_or_create( - source_entity=s_entity, - target_entity=t_entity, - relationship_type=rel_data['type'], - weight=0.9 + source_entity=person, + target_entity=associate, + relationship_type='ASSOCIATED_WITH', + weight=0.5 ) - return person or Entity.objects.filter(value=query).first() + return person except Exception as e: logger.error(f"Error performing web-based discovery for {query}: {e}") @@ -87,4 +94,4 @@ class EntityResolutionService: @staticmethod def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8): # Implementation remains unchanged - return True + return True