# 39269-vm/core/services/resolution.py

import requests
import logging
from bs4 import BeautifulSoup
from core.models import Entity, Relationship, Source
from urllib.parse import urljoin, quote, unquote
from django.utils import timezone

logger = logging.getLogger(__name__)

class WebCrawler:
    """HTTP helper that fetches pages and scrapes DuckDuckGo HTML search results.

    All requests made through one instance share a single ``requests.Session``
    configured with a browser-like ``User-Agent`` header.
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        })

    def fetch_url(self, url):
        """Fetch ``url`` and extract its title, meta description and images.

        Args:
            url: Address of the page to download.

        Returns:
            A ``(meta, images)`` tuple. ``meta`` is a dict with ``"title"`` and
            ``"description"`` keys (either may be an empty string). ``images``
            holds the absolute URLs of the non-``data:`` sources found among
            the first five ``<img>`` tags. On any failure the error is logged
            and ``(None, [])`` is returned.
        """
        try:
            logger.info("CRAWLER: Fetching %s", url)
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # ``.string`` is None for an empty <title> or one containing nested
            # markup; ``get_text()`` always yields a str, so a quirky title no
            # longer throws away the whole page via the except clause below.
            title = soup.title.get_text().strip() if soup.title else ""
            desc_tag = soup.find("meta", attrs={"name": "description"})
            description = (desc_tag.get("content") or "").strip() if desc_tag else ""

            images = []
            for img in soup.find_all("img", limit=5):
                src = img.get("src")
                # Inline data: URIs are not fetchable resources; skip them.
                if src and not src.startswith("data:"):
                    images.append(urljoin(url, src))

            return {"title": title, "description": description}, images
        except Exception as e:
            # Deliberately best-effort: one bad page must not abort a crawl.
            logger.error("CRAWLER ERROR: %s: %s", url, e)
            return None, []

    @staticmethod
    def _resolve_result_url(href, base_url):
        """Turn a search-result ``href`` into the absolute target URL.

        Args:
            href: Raw ``href`` attribute of a result link.
            base_url: URL of the results page, used to absolutize relative
                and protocol-relative links.

        Returns:
            The decoded ``uddg`` target when ``href`` is a redirect link,
            otherwise ``href`` resolved against ``base_url``.
        """
        if "uddg=" in href:
            # Redirect links carry the percent-encoded target in ``uddg``.
            encoded_target = href.split("uddg=", 1)[1].split("&", 1)[0]
            return unquote(encoded_target)
        # Direct links may be relative ("/x") or protocol-relative ("//host/x"),
        # neither of which can be fetched without a scheme and host.
        return urljoin(base_url, href)

    def search(self, query):
        """Perform a DuckDuckGo search.

        Args:
            query: Free-text search terms; percent-encoded before sending.

        Returns:
            A list of ``{"title": str, "url": str}`` dicts, one per result
            link found. An empty list is returned (and the error logged) if
            the request or parsing fails.
        """
        search_url = f"https://duckduckgo.com/html/?q={quote(query)}"
        try:
            response = self.session.get(search_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            results = []
            for res in soup.find_all("div", class_="result"):
                a_tag = res.find("a", class_="result__a")
                href = a_tag.get("href") if a_tag else None
                if not href:
                    continue
                url = self._resolve_result_url(href, search_url)
                if url:
                    results.append({"title": a_tag.get_text().strip(), "url": url})
            return results
        except Exception as e:
            # Best-effort: callers treat "no results" and "search failed" alike.
            logger.error("SEARCH ERROR: %s: %s", query, e)
            return []

class NetworkDiscoveryService:
    """Builds a two-level ASSOCIATED_WITH graph around a query from web search."""

    MAX_PRIMARY_RESULTS = 6      # first-level search hits that get crawled
    MAX_SECONDARY_RESULTS = 2    # second-level hits crawled per associate
    MAX_VALUE_LENGTH = 100       # stored entity values are cut to this length
    PRIMARY_WEIGHT = 0.7         # weight of person -> associate edges
    SECONDARY_WEIGHT = 0.3       # weight of associate -> second-degree edges

    @staticmethod
    def _link(source_entity, target_entity, weight):
        """Ensure a single ASSOCIATED_WITH relationship exists between two entities."""
        # ``weight`` goes in ``defaults`` so it is applied on creation only.
        # As a lookup kwarg, an existing edge with a different weight would be
        # missed and a duplicate edge created.
        Relationship.objects.get_or_create(
            source_entity=source_entity,
            target_entity=target_entity,
            relationship_type='ASSOCIATED_WITH',
            defaults={'weight': weight},
        )

    @staticmethod
    def perform_osint_search(query):
        """Perform deep discovery, creating retroactive relationships.

        Searches for ``query``, crawls the top hits and records each page
        title as an associated PERSON entity linked to the query entity.
        Each associate is then searched in turn, and the titles of its top
        hits are linked to it as second-degree associates.

        Args:
            query: Search text; also stored as the value of the root entity.

        Returns:
            The saved ``Entity`` for ``query``, with ``last_seen`` refreshed
            and ``metadata`` extended by a ``url -> description`` entry for
            every first-level page that was fetched successfully.
        """
        limits = NetworkDiscoveryService
        crawler = WebCrawler()
        search_results = crawler.search(query)

        source, _ = Source.objects.get_or_create(name='Web Crawler Engine')
        person, _ = Entity.objects.get_or_create(entity_type='PERSON', value=query, source=source)
        person.last_seen = timezone.now()
        # NOTE(review): defensive guard in case the metadata field is nullable;
        # the model definition is not visible here -- confirm.
        if person.metadata is None:
            person.metadata = {}

        # Deep discovery: fetch related entities and link them
        for res in search_results[:limits.MAX_PRIMARY_RESULTS]:
            meta, _ = crawler.fetch_url(res['url'])
            if not meta:
                continue

            # Store metadata in the entity
            person.metadata[res['url']] = meta['description']

            associate_title = (meta['title'] or res['title'] or "").strip()
            # Compare the value that is actually stored (truncated), so a long
            # title can never resolve to the same row as the entity it links to.
            associate_val = associate_title[:limits.MAX_VALUE_LENGTH]
            if not associate_val or associate_val.lower() == query.lower():
                continue

            # Create associate entity
            associate, _ = Entity.objects.get_or_create(
                entity_type='PERSON', value=associate_val, source=source
            )
            associate.last_seen = timezone.now()
            associate.save()

            # Create relationship
            NetworkDiscoveryService._link(person, associate, limits.PRIMARY_WEIGHT)

            # Retroactive check: search associates to find further connections (level 2)
            second_degree = crawler.search(associate_title)
            for sec in second_degree[:limits.MAX_SECONDARY_RESULTS]:
                s_meta, _ = crawler.fetch_url(sec['url'])
                if not (s_meta and s_meta['title']):
                    continue
                target_val = s_meta['title'][:limits.MAX_VALUE_LENGTH]
                # Skip self-links: both sides are compared in stored form.
                if target_val.lower() == associate_val.lower():
                    continue
                target, _ = Entity.objects.get_or_create(
                    entity_type='PERSON', value=target_val, source=source
                )
                NetworkDiscoveryService._link(associate, target, limits.SECONDARY_WEIGHT)

        person.save()
        return person

class EntityResolutionService:
    """Entry point that resolves a request payload into a discovered entity."""

    @staticmethod
    def resolve(data):
        """Run network discovery for the payload's ``'query'`` value.

        Args:
            data: Mapping-like object exposing ``.get``; only the ``'query'``
                key is read.

        Returns:
            Whatever ``NetworkDiscoveryService.perform_osint_search`` returns
            for the query, or ``None`` when the query is missing or falsy.
        """
        search_term = data.get('query')
        if not search_term:
            return None
        return NetworkDiscoveryService.perform_osint_search(search_term)