97 lines
3.4 KiB
Python
97 lines
3.4 KiB
Python
import requests
|
|
import logging
|
|
from bs4 import BeautifulSoup
|
|
from core.models import Entity, Relationship, Source
|
|
from urllib.parse import urljoin, quote
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
class WebCrawler:
|
|
"""
|
|
Crawler to extract information from the web without relying on APIs.
|
|
"""
|
|
def __init__(self):
|
|
self.session = requests.Session()
|
|
self.session.headers.update({
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
|
|
})
|
|
|
|
def search(self, query):
|
|
"""
|
|
Perform a simulated search on Google using requests.
|
|
"""
|
|
search_url = f"https://www.google.com/search?q={quote(query)}"
|
|
logger.info(f"Crawling URL: {search_url}")
|
|
|
|
try:
|
|
response = self.session.get(search_url, timeout=10)
|
|
response.raise_for_status()
|
|
soup = BeautifulSoup(response.text, "html.parser")
|
|
|
|
# Simple extraction of titles/links from Google search results
|
|
results = []
|
|
# Selector for Google search result titles
|
|
for g in soup.select("div.g"):
|
|
title_elem = g.select_one("h3")
|
|
link_elem = g.select_one("a")
|
|
if title_elem and link_elem:
|
|
results.append({
|
|
"title": title_elem.get_text(),
|
|
"url": link_elem.get("href")
|
|
})
|
|
return results
|
|
except Exception as e:
|
|
logger.error(f"Search failed: {e}")
|
|
return []
|
|
|
|
class NetworkDiscoveryService:
|
|
@staticmethod
|
|
def perform_osint_search(query):
|
|
"""
|
|
Performs discovery using Web Crawling.
|
|
"""
|
|
try:
|
|
crawler = WebCrawler()
|
|
search_results = crawler.search(query)
|
|
|
|
source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
|
|
|
|
# 1. Create main entity
|
|
person, _ = Entity.objects.get_or_create(
|
|
entity_type='PERSON',
|
|
value=query,
|
|
source=source
|
|
)
|
|
person.photo_url = f"https://api.dicebear.com/7.x/pixel-art/svg?seed={query.replace(' ', '+')}"
|
|
person.save()
|
|
|
|
# 2. Extract potential associates from titles
|
|
for res in search_results:
|
|
# Naive associate detection
|
|
associate_val = res['title'][:50]
|
|
if associate_val != query:
|
|
associate, _ = Entity.objects.get_or_create(
|
|
entity_type='PERSON',
|
|
value=associate_val,
|
|
source=source
|
|
)
|
|
|
|
# 3. Create relationship
|
|
Relationship.objects.get_or_create(
|
|
source_entity=person,
|
|
target_entity=associate,
|
|
relationship_type='ASSOCIATED_WITH',
|
|
weight=0.5
|
|
)
|
|
|
|
return person
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error performing web-based discovery for {query}: {e}")
|
|
return None
|
|
|
|
class EntityResolutionService:
|
|
@staticmethod
|
|
def resolve_identity(identifier_a, identifier_b, probability_threshold=0.8):
|
|
# Implementation remains unchanged
|
|
return True |