RIPLEY

2026-03-22 23:51:55 +00:00 · 2026-03-22 23:51:55 +00:00 · ed62ae0c79
commit ed62ae0c79
parent 69b734c063
2 changed files with 55 additions and 10 deletions
--- a/core/services/__pycache__/resolution.cpython-311.pyc
+++ b/core/services/__pycache__/resolution.cpython-311.pyc
--- a/core/services/resolution.py
+++ b/core/services/resolution.py
@@ -16,6 +16,37 @@ class WebCrawler:
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        })

+    def fetch_url(self, url):
+        """
+        Fetch a page and extract basic metadata and image links.
+
+        Args:
+            url: Absolute URL of the page to crawl.
+
+        Returns:
+            A ``(metadata, images)`` tuple. ``metadata`` is a dict whose
+            ``"title"`` and ``"description"`` values are always strings
+            (``"No title"`` and ``""`` when the page provides none).
+            ``images`` lists the absolute URLs of the ``src`` attributes
+            found on the first three ``<img>`` tags of the document.
+            On any failure the error is logged and ``(None, [])`` is
+            returned, so callers must handle ``metadata`` being ``None``.
+        """
+        try:
+            response = self.session.get(url, timeout=10)
+            response.raise_for_status()
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # ``soup.title.string`` is None when <title> is empty or wraps
+            # child tags, which would leak a None title to callers. Read the
+            # text content instead and fall back to the sentinel explicitly.
+            title = soup.title.get_text().strip() if soup.title else ""
+
+            description_tag = soup.find("meta", attrs={"name": "description"})
+            description = description_tag.get("content", "") if description_tag else ""
+
+            metadata = {
+                "title": title or "No title",
+                "description": description,
+            }
+
+            # Relative image paths have to be resolved against the URL the
+            # document was actually served from (after redirects), not the
+            # one that was originally requested.
+            base_url = response.url or url
+
+            # Only the first three <img> tags are inspected; tags without a
+            # usable ``src`` are dropped, so fewer than three may be returned.
+            sources = (img.get("src") for img in soup.find_all("img", limit=3))
+            images = [urljoin(base_url, src) for src in sources if src]
+
+            return metadata, images
+        except Exception as e:
+            # Crawling is best-effort: one unreachable or malformed page must
+            # not abort the surrounding discovery run.
+            logger.error(f"Failed to crawl {url}: {e}")
+            return None, []
+
+
    def search(self, query):
        """
        Perform a simulated search on Google using requests.
@@ -28,16 +59,18 @@ class WebCrawler:
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            
-            # Simple extraction of titles/links from Google search results
            results = []
-            # Selector for Google search result titles
            for g in soup.select("div.g"):
                title_elem = g.select_one("h3")
                link_elem = g.select_one("a")
                if title_elem and link_elem:
+                    url = link_elem.get("href")
+                    # Handle Google's link redirecting
+                    if url.startswith("/url?q="):
+                        url = url.split("/url?q=")[1].split("&")[0]
                    results.append({
                        "title": title_elem.get_text(),
-                        "url": link_elem.get("href")
+                        "url": url
                    })
            return results
        except Exception as e:
@@ -48,7 +81,7 @@ class NetworkDiscoveryService:
    @staticmethod
    def perform_osint_search(query):
        """
-        Performs discovery using Web Crawling.
+        Performs discovery using Web Crawling, extracting metadata and images.
        """
        try:
            crawler = WebCrawler()
@@ -62,13 +95,22 @@ class NetworkDiscoveryService:
                value=query,
                source=source
            )
+            # Default photo fallback
            person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
-            person.save()
            
-            # 2. Extract potential associates from titles
+            # 2. Extract potential associates and crawl their pages
            for res in search_results:
-                # Naive associate detection
-                associate_val = res['title'][:50]
+                metadata, images = crawler.fetch_url(res['url'])
+                
+                # If we found an image on their page, prioritize that for the main person if it's the first result
+                if images and not person.photo_url.startswith("https://api.dicebear.com"):
+                    person.photo_url = images[0]
+                elif images and person.photo_url.startswith("https://api.dicebear.com"):
+                    # For demo purposes, set photo from the first relevant page
+                    person.photo_url = images[0]
+
+                # Create associate
+                associate_val = metadata['title'] if metadata and metadata['title'] != "No title" else res['title'][:50]
                if associate_val != query:
                    associate, _ = Entity.objects.get_or_create(
                        entity_type='PERSON',
@@ -76,6 +118,8 @@ class NetworkDiscoveryService:
                        source=source
                    )
                    
+                    # Store link/metadata info if you have a field for it
+                    
                    # 3. Create relationship
                    Relationship.objects.get_or_create(
                        source_entity=person,
@@ -84,6 +128,7 @@ class NetworkDiscoveryService:
                        weight=0.5
                    )
            
+            person.save()
            return person

        except Exception as e: