This commit is contained in:
Flatlogic Bot 2026-03-22 23:51:55 +00:00
parent 69b734c063
commit ed62ae0c79
2 changed files with 55 additions and 10 deletions

View File

@ -16,6 +16,37 @@ class WebCrawler:
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}) })
def fetch_url(self, url):
    """
    Fetch *url* and extract basic page metadata and image links.

    Args:
        url: Absolute URL to fetch using the crawler's shared session
            (inherits the session's User-Agent header).

    Returns:
        tuple: ``(metadata, images)`` where ``metadata`` is a dict with
        ``"title"`` and ``"description"`` keys (both always ``str``) and
        ``images`` is a list of up to 3 absolute image URLs. On any
        failure returns ``(None, [])`` — this method never raises.
    """
    try:
        response = self.session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # <title> may be absent OR present-but-empty; in the empty case
        # soup.title.string is None, so guard both to keep title a str.
        if soup.title and soup.title.string:
            title = soup.title.string
        else:
            title = "No title"

        # find() returns a Tag or None; normalize to a plain string.
        desc_tag = soup.find("meta", attrs={"name": "description"})
        description = desc_tag.get("content", "") if desc_tag else ""

        metadata = {"title": title, "description": description}

        # Collect up to 3 images, resolving relative src values against
        # the page URL so callers always receive absolute links.
        images = []
        for img in soup.find_all("img", limit=3):
            src = img.get("src")
            if src:
                images.append(urljoin(url, src))
        return metadata, images
    except Exception as e:
        # Best-effort crawler: never propagate; log and signal failure
        # with the (None, []) sentinel documented above.
        logger.error(f"Failed to crawl {url}: {e}")
        return None, []
def search(self, query): def search(self, query):
""" """
Perform a simulated search on Google using requests. Perform a simulated search on Google using requests.
@ -28,16 +59,18 @@ class WebCrawler:
response.raise_for_status() response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
# Simple extraction of titles/links from Google search results
results = [] results = []
# Selector for Google search result titles
for g in soup.select("div.g"): for g in soup.select("div.g"):
title_elem = g.select_one("h3") title_elem = g.select_one("h3")
link_elem = g.select_one("a") link_elem = g.select_one("a")
if title_elem and link_elem: if title_elem and link_elem:
url = link_elem.get("href")
# Handle Google's link redirecting
if url.startswith("/url?q="):
url = url.split("/url?q=")[1].split("&")[0]
results.append({ results.append({
"title": title_elem.get_text(), "title": title_elem.get_text(),
"url": link_elem.get("href") "url": url
}) })
return results return results
except Exception as e: except Exception as e:
@ -48,7 +81,7 @@ class NetworkDiscoveryService:
@staticmethod @staticmethod
def perform_osint_search(query): def perform_osint_search(query):
""" """
Performs discovery using Web Crawling. Performs discovery using Web Crawling, extracting metadata and images.
""" """
try: try:
crawler = WebCrawler() crawler = WebCrawler()
@ -62,13 +95,22 @@ class NetworkDiscoveryService:
value=query, value=query,
source=source source=source
) )
# Default photo fallback
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}" person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
person.save()
# 2. Extract potential associates and crawl their pages
# 2. Extract potential associates from titles
for res in search_results: for res in search_results:
# Naive associate detection metadata, images = crawler.fetch_url(res['url'])
associate_val = res['title'][:50]
# If we found an image on their page, prioritize that for the main person if it's the first result
if images and not person.photo_url.startswith("https://api.dicebear.com"):
person.photo_url = images[0]
elif images and person.photo_url.startswith("https://api.dicebear.com"):
# For demo purposes, set photo from the first relevant page
person.photo_url = images[0]
# Create associate
associate_val = metadata['title'] if metadata and metadata['title'] != "No title" else res['title'][:50]
if associate_val != query: if associate_val != query:
associate, _ = Entity.objects.get_or_create( associate, _ = Entity.objects.get_or_create(
entity_type='PERSON', entity_type='PERSON',
@ -76,6 +118,8 @@ class NetworkDiscoveryService:
source=source source=source
) )
# Store link/metadata info if you have a field for it
# 3. Create relationship # 3. Create relationship
Relationship.objects.get_or_create( Relationship.objects.get_or_create(
source_entity=person, source_entity=person,
@ -84,6 +128,7 @@ class NetworkDiscoveryService:
weight=0.5 weight=0.5
) )
person.save()
return person return person
except Exception as e: except Exception as e:
@ -94,4 +139,4 @@ class EntityResolutionService:
@staticmethod @staticmethod
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8): def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
# Implementation remains unchanged # Implementation remains unchanged
return True return True