124 lines
5.3 KiB
Python
124 lines
5.3 KiB
Python
import logging
from urllib.parse import parse_qs, quote, unquote, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from django.utils import timezone

from core.models import Entity, Relationship, Source
|
|
|
|
# Module-level logger named after this module; used by the crawler for
# fetch/search info and error messages.
logger = logging.getLogger(__name__)
|
|
|
|
class WebCrawler:
    """Scrape basic page metadata and DuckDuckGo HTML search results.

    Every request made through one instance shares a single
    ``requests.Session`` that sends a desktop-browser User-Agent header.
    """

    # Base of the DuckDuckGo HTML search page; also used to resolve the
    # scheme-relative redirect links found in its result list.
    SEARCH_ENDPOINT = "https://duckduckgo.com/html/"
    # Seconds to wait for any single HTTP request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Create the shared HTTP session and set its User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        })

    def fetch_url(self, url):
        """Fetch *url* and extract its title, meta description and images.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            A ``(meta, images)`` tuple. ``meta`` is a dict with the keys
            ``"title"`` and ``"description"`` (each possibly an empty
            string). ``images`` holds absolute URLs taken from the first
            five ``<img>`` tags, skipping tags without ``src`` and inline
            ``data:`` URIs. On any failure the result is ``(None, [])``.
        """
        try:
            logger.info("CRAWLER: Fetching %s", url)
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # ``Tag.string`` is None for an empty <title> or one containing
            # nested markup; get_text() always returns a str, so such pages
            # yield an empty/flattened title instead of failing the fetch.
            title = soup.title.get_text().strip() if soup.title else ""
            desc_tag = soup.find("meta", attrs={"name": "description"})
            description = desc_tag.get("content", "").strip() if desc_tag else ""

            sources = (img.get("src") for img in soup.find_all("img", limit=5))
            images = [
                urljoin(url, src)
                for src in sources
                if src and not src.startswith("data:")
            ]

            return {"title": title, "description": description}, images
        except Exception as e:
            # Deliberately best-effort: callers treat (None, []) as
            # "skip this page", so network and parse errors are logged
            # rather than propagated.
            logger.error("CRAWLER ERROR: %s: %s", url, e)
            return None, []

    @staticmethod
    def _resolve_result_url(href):
        """Return the destination URL behind a DuckDuckGo result link.

        Result links are either direct URLs or redirects that carry the
        percent-encoded destination in a ``uddg`` query parameter. The href
        is first resolved against the search endpoint so scheme-relative
        and relative links become absolute; the ``uddg`` value is returned
        when present, otherwise the resolved href itself.
        """
        absolute = urljoin(WebCrawler.SEARCH_ENDPOINT, href)
        targets = parse_qs(urlsplit(absolute).query).get("uddg")
        return targets[0] if targets else absolute

    def search(self, query):
        """Perform a DuckDuckGo search.

        Args:
            query: Search phrase; it is percent-encoded into the request URL.

        Returns:
            A list of ``{"title": ..., "url": ...}`` dicts, one per result
            that has a ``result__a`` link with an ``href``. An empty list is
            returned when the request or the parsing fails.
        """
        search_url = f"{self.SEARCH_ENDPOINT}?q={quote(query)}"
        try:
            response = self.session.get(search_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            results = []
            for res in soup.find_all("div", class_="result"):
                a_tag = res.find("a", class_="result__a")
                href = a_tag.get("href") if a_tag else None
                if not href:
                    continue
                results.append({
                    "title": a_tag.get_text().strip(),
                    "url": self._resolve_result_url(href),
                })
            return results
        except Exception as e:
            # Best-effort, mirroring fetch_url: a failed search yields no
            # results instead of raising.
            logger.error("SEARCH ERROR: %s: %s", query, e)
            return []
|
|
|
|
class NetworkDiscoveryService:
    """Build PERSON entities and ASSOCIATED_WITH links from web search hits."""

    @staticmethod
    def _link(source_entity, target_entity, weight):
        """Ensure an ASSOCIATED_WITH relationship exists between two entities.

        ``weight`` goes in ``defaults`` so it is only applied when the row is
        created; an existing edge with a different weight is reused instead
        of being duplicated.
        """
        Relationship.objects.get_or_create(
            source_entity=source_entity,
            target_entity=target_entity,
            relationship_type='ASSOCIATED_WITH',
            defaults={'weight': weight},
        )

    @staticmethod
    def perform_osint_search(query, *, max_results=6, max_second_degree=2):
        """Perform deep discovery, creating retroactive relationships.

        Searches the web for *query*, stores each fetched page's description
        in the queried entity's metadata (keyed by URL), and creates an
        associate entity per page title. Each associate is then searched in
        turn to add second-degree links.

        Args:
            query: Name to search for; used verbatim as the entity value.
            max_results: How many first-level search results to fetch.
            max_second_degree: How many results to fetch per associate.

        Returns:
            The saved ``Entity`` for *query*.
        """
        crawler = WebCrawler()
        search_results = crawler.search(query)

        source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
        person, _ = Entity.objects.get_or_create(entity_type='PERSON', value=query, source=source)
        person.last_seen = timezone.now()
        # NOTE(review): assumes Entity.metadata is a dict-like JSON field;
        # guard against a NULL value so the update below cannot fail.
        if person.metadata is None:
            person.metadata = {}

        # Deep discovery: fetch related entities and link them
        for res in search_results[:max_results]:
            meta, _images = crawler.fetch_url(res['url'])
            if not meta:
                continue

            # Store metadata in the entity
            person.metadata[res['url']] = meta['description']

            associate_title = (meta['title'] or res['title'] or "").strip()
            # Truncate once, up front: the same 100-character value is stored
            # on the entity and used in every comparison, so a long title can
            # no longer slip past the self-link check further down.
            associate_val = associate_title[:100]
            if not associate_val or associate_val.lower() == query.lower():
                continue

            # Create associate entity
            associate, _ = Entity.objects.get_or_create(
                entity_type='PERSON', value=associate_val, source=source
            )
            associate.last_seen = timezone.now()
            associate.save()

            # Create relationship
            NetworkDiscoveryService._link(person, associate, 0.7)

            # Retroactive check: search associates to find further connections (level 2)
            second_degree = crawler.search(associate_title)
            for sec in second_degree[:max_second_degree]:
                s_meta, _ = crawler.fetch_url(sec['url'])
                if not (s_meta and s_meta['title']):
                    continue

                target_val = s_meta['title'][:100]
                if target_val.lower() == associate_val.lower():
                    # Same page title as the associate: skip to avoid
                    # linking an entity to itself.
                    continue

                target, _ = Entity.objects.get_or_create(
                    entity_type='PERSON', value=target_val, source=source
                )
                NetworkDiscoveryService._link(associate, target, 0.3)

        person.save()
        return person
|
|
|
|
class EntityResolutionService:
    """Entry point that turns a request payload into a discovery run."""

    @staticmethod
    def resolve(data):
        """Run an OSINT search for the ``query`` found in *data*.

        Args:
            data: Mapping-like payload exposing ``.get``; may be ``None`` or
                empty.

        Returns:
            Whatever ``NetworkDiscoveryService.perform_osint_search`` returns
            for the query, or ``None`` when *data* is missing/empty or its
            query is absent or blank.
        """
        if not data:
            return None

        query = data.get('query')
        if isinstance(query, str):
            # A whitespace-only query is truthy but meaningless; strip it so
            # it neither triggers a crawl nor creates a blank entity.
            query = query.strip()
        if not query:
            return None

        return NetworkDiscoveryService.perform_osint_search(query)