RIPLEY
This commit is contained in:
parent
69b734c063
commit
ed62ae0c79
Binary file not shown.
@ -16,6 +16,37 @@ class WebCrawler:
|
|||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
||||||
})
|
})
|
||||||
|
|
||||||
|
def fetch_url(self, url):
    """
    Fetch URL and extract basic metadata and image links.

    Args:
        url: Absolute URL of the page to download; it is also the base
            against which relative image paths are resolved.

    Returns:
        A ``(metadata, images)`` tuple. ``metadata`` is a dict with the
        keys ``"title"`` (the sentinel ``"No title"`` when the page has
        no usable title) and ``"description"`` (``""`` when there is no
        description meta tag). ``images`` is a list of up to three
        absolute image URLs. On any failure the method logs the error
        and returns ``(None, [])`` instead of raising.
    """
    max_images = 3
    try:
        response = self.session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract meta tags.
        # ``Tag.string`` is None when <title> is empty or contains nested
        # markup, so read the text instead and fall back to the sentinel
        # that callers compare against.
        title = soup.title.get_text(strip=True) if soup.title else ""
        description_tag = soup.find("meta", attrs={"name": "description"})
        metadata = {
            "title": title or "No title",
            "description": description_tag.get("content", "") if description_tag else "",
        }

        # Extract images (top 3).
        # Filter on ``src`` before counting so that leading <img> tags
        # without a usable source do not consume the quota.
        images = []
        for img in soup.find_all("img", src=True):
            src = img["src"].strip()
            if not src:
                continue
            images.append(urljoin(url, src))
            if len(images) == max_images:
                break

        return metadata, images
    except Exception as e:
        # Deliberate best-effort boundary: one page failing to download
        # or parse must not abort the caller's crawl loop.
        logger.error(f"Failed to crawl {url}: {e}")
        return None, []
|
||||||
|
|
||||||
def search(self, query):
|
def search(self, query):
|
||||||
"""
|
"""
|
||||||
Perform a simulated search on Google using requests.
|
Perform a simulated search on Google using requests.
|
||||||
@ -28,16 +59,18 @@ class WebCrawler:
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
# Simple extraction of titles/links from Google search results
|
|
||||||
results = []
|
results = []
|
||||||
# Selector for Google search result titles
|
|
||||||
for g in soup.select("div.g"):
|
for g in soup.select("div.g"):
|
||||||
title_elem = g.select_one("h3")
|
title_elem = g.select_one("h3")
|
||||||
link_elem = g.select_one("a")
|
link_elem = g.select_one("a")
|
||||||
if title_elem and link_elem:
|
if title_elem and link_elem:
|
||||||
|
url = link_elem.get("href")
|
||||||
|
# Handle Google's link redirecting
|
||||||
|
if url.startswith("/url?q="):
|
||||||
|
url = url.split("/url?q=")[1].split("&")[0]
|
||||||
results.append({
|
results.append({
|
||||||
"title": title_elem.get_text(),
|
"title": title_elem.get_text(),
|
||||||
"url": link_elem.get("href")
|
"url": url
|
||||||
})
|
})
|
||||||
return results
|
return results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -48,7 +81,7 @@ class NetworkDiscoveryService:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def perform_osint_search(query):
|
def perform_osint_search(query):
|
||||||
"""
|
"""
|
||||||
Performs discovery using Web Crawling.
|
Performs discovery using Web Crawling, extracting metadata and images.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
crawler = WebCrawler()
|
crawler = WebCrawler()
|
||||||
@ -62,13 +95,22 @@ class NetworkDiscoveryService:
|
|||||||
value=query,
|
value=query,
|
||||||
source=source
|
source=source
|
||||||
)
|
)
|
||||||
|
# Default photo fallback
|
||||||
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
|
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
|
||||||
person.save()
|
|
||||||
|
# 2. Extract potential associates and crawl their pages
|
||||||
# 2. Extract potential associates from titles
|
|
||||||
for res in search_results:
|
for res in search_results:
|
||||||
# Naive associate detection
|
metadata, images = crawler.fetch_url(res['url'])
|
||||||
associate_val = res['title'][:50]
|
|
||||||
|
# If we found an image on their page, prioritize that for the main person if it's the first result
|
||||||
|
if images and not person.photo_url.startswith("https://api.dicebear.com"):
|
||||||
|
person.photo_url = images[0]
|
||||||
|
elif images and person.photo_url.startswith("https://api.dicebear.com"):
|
||||||
|
# For demo purposes, set photo from the first relevant page
|
||||||
|
person.photo_url = images[0]
|
||||||
|
|
||||||
|
# Create associate
|
||||||
|
associate_val = metadata['title'] if metadata and metadata['title'] != "No title" else res['title'][:50]
|
||||||
if associate_val != query:
|
if associate_val != query:
|
||||||
associate, _ = Entity.objects.get_or_create(
|
associate, _ = Entity.objects.get_or_create(
|
||||||
entity_type='PERSON',
|
entity_type='PERSON',
|
||||||
@ -76,6 +118,8 @@ class NetworkDiscoveryService:
|
|||||||
source=source
|
source=source
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Store link/metadata info if you have a field for it
|
||||||
|
|
||||||
# 3. Create relationship
|
# 3. Create relationship
|
||||||
Relationship.objects.get_or_create(
|
Relationship.objects.get_or_create(
|
||||||
source_entity=person,
|
source_entity=person,
|
||||||
@ -84,6 +128,7 @@ class NetworkDiscoveryService:
|
|||||||
weight=0.5
|
weight=0.5
|
||||||
)
|
)
|
||||||
|
|
||||||
|
person.save()
|
||||||
return person
|
return person
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -94,4 +139,4 @@ class EntityResolutionService:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
|
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
|
||||||
# Implementation remains unchanged
|
# Implementation remains unchanged
|
||||||
return True
|
return True
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user