Ripley
This commit is contained in:
parent
ed62ae0c79
commit
23199338ec
Binary file not shown.
@ -7,53 +7,39 @@ from urllib.parse import urljoin, quote
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class WebCrawler:
|
class WebCrawler:
|
||||||
"""
|
|
||||||
Crawler to extract information from the web without relying on APIs.
|
|
||||||
"""
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.session = requests.Session()
|
self.session = requests.Session()
|
||||||
self.session.headers.update({
|
self.session.headers.update({
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
})
|
})
|
||||||
|
|
||||||
def fetch_url(self, url):
|
def fetch_url(self, url):
|
||||||
"""
|
"""Fetch URL, extract title, meta description, and top images."""
|
||||||
Fetch URL and extract basic metadata and image links.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
|
logger.info(f"Crawling page: {url}")
|
||||||
response = self.session.get(url, timeout=10)
|
response = self.session.get(url, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
|
||||||
# Extract meta tags
|
title = soup.title.string.strip() if soup.title else ""
|
||||||
metadata = {
|
desc_tag = soup.find("meta", attrs={"name": "description"})
|
||||||
"title": soup.title.string if soup.title else "No title",
|
description = desc_tag.get("content", "").strip() if desc_tag else ""
|
||||||
"description": soup.find("meta", attrs={"name": "description"}),
|
|
||||||
}
|
|
||||||
if metadata["description"]:
|
|
||||||
metadata["description"] = metadata["description"].get("content", "")
|
|
||||||
else:
|
|
||||||
metadata["description"] = ""
|
|
||||||
|
|
||||||
# Extract images (top 3)
|
|
||||||
images = []
|
images = []
|
||||||
for img in soup.find_all("img", limit=3):
|
for img in soup.find_all("img", limit=5):
|
||||||
src = img.get("src")
|
src = img.get("src")
|
||||||
if src:
|
if src and not src.startswith("data:"):
|
||||||
images.append(urljoin(url, src))
|
full_src = urljoin(url, src)
|
||||||
|
images.append(full_src)
|
||||||
|
|
||||||
return metadata, images
|
return {"title": title, "description": description}, images
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to crawl {url}: {e}")
|
logger.warning(f"Crawling failed for {url}: {e}")
|
||||||
return None, []
|
return None, []
|
||||||
|
|
||||||
def search(self, query):
|
def search(self, query):
|
||||||
"""
|
"""Perform a Google search."""
|
||||||
Perform a simulated search on Google using requests.
|
|
||||||
"""
|
|
||||||
search_url = f"https://www.google.com/search?q={quote(query)}"
|
search_url = f"https://www.google.com/search?q={quote(query)}"
|
||||||
logger.info(f"Crawling URL: {search_url}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = self.session.get(search_url, timeout=10)
|
response = self.session.get(search_url, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@ -65,78 +51,44 @@ class WebCrawler:
|
|||||||
link_elem = g.select_one("a")
|
link_elem = g.select_one("a")
|
||||||
if title_elem and link_elem:
|
if title_elem and link_elem:
|
||||||
url = link_elem.get("href")
|
url = link_elem.get("href")
|
||||||
# Handle Google's link redirecting
|
|
||||||
if url.startswith("/url?q="):
|
if url.startswith("/url?q="):
|
||||||
url = url.split("/url?q=")[1].split("&")[0]
|
url = url.split("/url?q=")[1].split("&")[0]
|
||||||
results.append({
|
results.append({"title": title_elem.get_text(), "url": url})
|
||||||
"title": title_elem.get_text(),
|
|
||||||
"url": url
|
|
||||||
})
|
|
||||||
return results
|
return results
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Search failed: {e}")
|
logger.error(f"Search failed for {query}: {e}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
class NetworkDiscoveryService:
|
class NetworkDiscoveryService:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def perform_osint_search(query):
|
def perform_osint_search(query):
|
||||||
"""
|
"""Perform discovery using Web Crawling, extracting metadata and images."""
|
||||||
Performs discovery using Web Crawling, extracting metadata and images.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
crawler = WebCrawler()
|
crawler = WebCrawler()
|
||||||
search_results = crawler.search(query)
|
search_results = crawler.search(query)
|
||||||
|
|
||||||
source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
|
source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
|
||||||
|
person, _ = Entity.objects.get_or_create(entity_type='PERSON', value=query, source=source)
|
||||||
|
|
||||||
# 1. Create main entity
|
# Use first valid image found among search results if available
|
||||||
person, _ = Entity.objects.get_or_create(
|
found_photo = None
|
||||||
entity_type='PERSON',
|
|
||||||
value=query,
|
|
||||||
source=source
|
|
||||||
)
|
|
||||||
# Default photo fallback
|
|
||||||
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
|
|
||||||
|
|
||||||
# 2. Extract potential associates and crawl their pages
|
for res in search_results[:3]: # Limit crawling to top 3
|
||||||
for res in search_results:
|
meta, images = crawler.fetch_url(res['url'])
|
||||||
metadata, images = crawler.fetch_url(res['url'])
|
|
||||||
|
|
||||||
# If we found an image on their page, prioritize that for the main person if it's the first result
|
if images and not found_photo:
|
||||||
if images and not person.photo_url.startswith("https://api.dicebear.com"):
|
found_photo = images[0]
|
||||||
person.photo_url = images[0]
|
|
||||||
elif images and person.photo_url.startswith("https://api.dicebear.com"):
|
|
||||||
# For demo purposes, set photo from the first relevant page
|
|
||||||
person.photo_url = images[0]
|
|
||||||
|
|
||||||
# Create associate
|
if meta:
|
||||||
associate_val = metadata['title'] if metadata and metadata['title'] != "No title" else res['title'][:50]
|
associate_val = meta['title'] or res['title']
|
||||||
if associate_val != query:
|
if associate_val and associate_val.lower() != query.lower():
|
||||||
associate, _ = Entity.objects.get_or_create(
|
associate, _ = Entity.objects.get_or_create(
|
||||||
entity_type='PERSON',
|
entity_type='PERSON', value=associate_val, source=source
|
||||||
value=associate_val,
|
|
||||||
source=source
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Store link/metadata info if you have a field for it
|
|
||||||
|
|
||||||
# 3. Create relationship
|
|
||||||
Relationship.objects.get_or_create(
|
Relationship.objects.get_or_create(
|
||||||
source_entity=person,
|
source_entity=person, target_entity=associate,
|
||||||
target_entity=associate,
|
relationship_type='ASSOCIATED_WITH', weight=0.5
|
||||||
relationship_type='ASSOCIATED_WITH',
|
|
||||||
weight=0.5
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
person.photo_url = found_photo or f"https://api.dicebear.com/7.x/initials/svg?seed={quote(query)}"
|
||||||
person.save()
|
person.save()
|
||||||
return person
|
return person
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error performing web-based discovery for {query}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
class EntityResolutionService:
|
|
||||||
@staticmethod
|
|
||||||
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
|
|
||||||
# Implementation remains unchanged
|
|
||||||
return True
|
|
||||||
|
|||||||
@ -6,6 +6,7 @@
|
|||||||
.node-group { cursor: pointer; }
|
.node-group { cursor: pointer; }
|
||||||
.node-circle { stroke: #fff; stroke-width: 2px; }
|
.node-circle { stroke: #fff; stroke-width: 2px; }
|
||||||
.node-text { font-size: 10px; pointer-events: none; }
|
.node-text { font-size: 10px; pointer-events: none; }
|
||||||
|
#loader { display: none; }
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
<div class="container mt-5">
|
<div class="container mt-5">
|
||||||
@ -18,7 +19,12 @@
|
|||||||
<h5 class="card-title">Network Discovery</h5>
|
<h5 class="card-title">Network Discovery</h5>
|
||||||
<form id="searchForm" class="input-group">
|
<form id="searchForm" class="input-group">
|
||||||
<input type="text" id="searchInput" class="form-control" placeholder="Search for a name to map their network...">
|
<input type="text" id="searchInput" class="form-control" placeholder="Search for a name to map their network...">
|
||||||
<button class="btn btn-primary" type="submit">Discover</button>
|
<button class="btn btn-primary" id="searchBtn" type="submit">
|
||||||
|
<span id="btnText">Discover</span>
|
||||||
|
<div id="loader" class="spinner-border spinner-border-sm" role="status">
|
||||||
|
<span class="visually-hidden">Loading...</span>
|
||||||
|
</div>
|
||||||
|
</button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -43,13 +49,32 @@ document.getElementById('searchForm').addEventListener('submit', function(e) {
|
|||||||
e.preventDefault();
|
e.preventDefault();
|
||||||
const query = document.getElementById('searchInput').value;
|
const query = document.getElementById('searchInput').value;
|
||||||
const graphContainer = d3.select("#graphContainer");
|
const graphContainer = d3.select("#graphContainer");
|
||||||
graphContainer.html('<p class="p-3">Discovering network...</p>');
|
const searchBtn = document.getElementById('searchBtn');
|
||||||
|
const btnText = document.getElementById('btnText');
|
||||||
|
const loader = document.getElementById('loader');
|
||||||
|
|
||||||
|
// UI Loading state
|
||||||
|
searchBtn.disabled = true;
|
||||||
|
btnText.textContent = "Searching...";
|
||||||
|
loader.style.display = "inline-block";
|
||||||
|
graphContainer.html('<p class="p-3 text-muted">Discovering network, please wait...</p>');
|
||||||
|
|
||||||
fetch(`{% url 'core:search_api' %}?q=${encodeURIComponent(query)}`)
|
fetch(`{% url 'core:search_api' %}?q=${encodeURIComponent(query)}`)
|
||||||
.then(response => response.json())
|
.then(response => {
|
||||||
|
if (!response.ok) throw new Error("Search failed");
|
||||||
|
return response.json();
|
||||||
|
})
|
||||||
.then(data => {
|
.then(data => {
|
||||||
graphContainer.html(''); // clear
|
graphContainer.html(''); // clear
|
||||||
renderGraph(data);
|
renderGraph(data);
|
||||||
|
})
|
||||||
|
.catch(err => {
|
||||||
|
graphContainer.html(`<p class="p-3 text-danger">Error: ${err.message}</p>`);
|
||||||
|
})
|
||||||
|
.finally(() => {
|
||||||
|
searchBtn.disabled = false;
|
||||||
|
btnText.textContent = "Discover";
|
||||||
|
loader.style.display = "none";
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user