def fetch_url(self, url):
    """
    Fetch *url* and extract basic page metadata and image links.

    Args:
        url (str): Absolute URL to crawl.

    Returns:
        tuple: ``(metadata, images)`` where ``metadata`` is a dict with
        ``"title"`` and ``"description"`` string keys (never ``None``),
        and ``images`` is a list of up to three absolute image URLs.
        On any failure, returns ``(None, [])``.
    """
    try:
        response = self.session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # <title> may exist but be empty (soup.title.string is None);
        # normalize both the missing-tag and empty-tag cases so callers
        # always get a str and the downstream `!= "No title"` check works.
        if soup.title and soup.title.string:
            title = soup.title.string
        else:
            title = "No title"

        # Meta description: missing tag or missing content both yield "".
        desc_tag = soup.find("meta", attrs={"name": "description"})
        description = desc_tag.get("content", "") if desc_tag else ""

        metadata = {"title": title, "description": description}

        # Extract up to 3 images, resolving relative src against the page URL.
        images = []
        for img in soup.find_all("img", limit=3):
            src = img.get("src")
            if src:
                images.append(urljoin(url, src))

        return metadata, images
    except Exception as e:
        # Best-effort crawler: log and signal failure rather than raise,
        # so one bad page does not abort the caller's discovery loop.
        logger.error(f"Failed to crawl {url}: {e}")
        return None, []
@@ -28,16 +59,18 @@ class WebCrawler: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - # Simple extraction of titles/links from Google search results results = [] - # Selector for Google search result titles for g in soup.select("div.g"): title_elem = g.select_one("h3") link_elem = g.select_one("a") if title_elem and link_elem: + url = link_elem.get("href") + # Handle Google's link redirecting + if url.startswith("/url?q="): + url = url.split("/url?q=")[1].split("&")[0] results.append({ "title": title_elem.get_text(), - "url": link_elem.get("href") + "url": url }) return results except Exception as e: @@ -48,7 +81,7 @@ class NetworkDiscoveryService: @staticmethod def perform_osint_search(query): """ - Performs discovery using Web Crawling. + Performs discovery using Web Crawling, extracting metadata and images. """ try: crawler = WebCrawler() @@ -62,13 +95,22 @@ class NetworkDiscoveryService: value=query, source=source ) + # Default photo fallback person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}" - person.save() - - # 2. Extract potential associates from titles + + # 2. 
Extract potential associates and crawl their pages for res in search_results: - # Naive associate detection - associate_val = res['title'][:50] + metadata, images = crawler.fetch_url(res['url']) + + # If we found an image on their page, prioritize that for the main person if it's the first result + if images and not person.photo_url.startswith("https://api.dicebear.com"): + person.photo_url = images[0] + elif images and person.photo_url.startswith("https://api.dicebear.com"): + # For demo purposes, set photo from the first relevant page + person.photo_url = images[0] + + # Create associate + associate_val = metadata['title'] if metadata and metadata['title'] != "No title" else res['title'][:50] if associate_val != query: associate, _ = Entity.objects.get_or_create( entity_type='PERSON', @@ -76,6 +118,8 @@ class NetworkDiscoveryService: source=source ) + # Store link/metadata info if you have a field for it + # 3. Create relationship Relationship.objects.get_or_create( source_entity=person, @@ -84,6 +128,7 @@ class NetworkDiscoveryService: weight=0.5 ) + person.save() return person except Exception as e: @@ -94,4 +139,4 @@ class EntityResolutionService: @staticmethod def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8): # Implementation remains unchanged - return True \ No newline at end of file + return True