# 38897-vm/hackathon/sentiment.py
# 2026-03-01 01:53:03 +00:00
# 92 lines, 3.3 KiB, Python
import pyarrow.parquet as pq
from datetime import datetime, date, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np
import subprocess
def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str) -> bool:
    """
    Run the Go scraper as a subprocess, passing arguments as CLI flags.

    Parameters
    ----------
    file_name : str
        Path to the Go source file executed via ``go run``.
    target_date : str
        Start date forwarded as ``-start`` (format defined by the Go tool —
        TODO confirm it expects YYYY-MM-DD).
    days : int
        Number of days to scrape, forwarded as ``-days``.
    ticker : str
        Ticker symbol forwarded as ``-ticker``.
    stock : str
        Stock name forwarded as ``-stock``.

    Returns
    -------
    bool
        True when the scraper exits with status 0, False otherwise.
        (Previously this returned None; callers that ignore the return
        value are unaffected.)
    """
    # List form (shell=False) keeps the arguments safe from shell injection.
    cmd = [
        "go", "run", file_name,
        f"-start={target_date}",
        f"-days={days}",
        f"-ticker={ticker}",
        f"-stock={stock}",
    ]
    print(f"Executing: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("Scraper Success!")
        print(result.stdout)
        return True
    print("Scraper Error:", result.stderr)
    return False
# Load FinBERT model, tokenizer, and sentiment pipeline once at import time
# so repeated calls to calculate_weighted_sentiment() reuse the same weights
# instead of re-downloading/re-loading them per call.
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Pipeline results are dicts with "label" in {"negative", "neutral", "positive"}
# and a confidence "score" in [0, 1].
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
def calculate_weighted_sentiment(file_path, end_date_str, num_days):
    """
    Compute a hype- and recency-weighted sentiment score from a parquet
    file of news mentions.

    Parameters
    ----------
    file_path : str
        Parquet file with at least "title" and "date" columns. Dates are
        assumed to be "%Y-%m-%d" strings — TODO confirm against the scraper.
    end_date_str : str
        Last day of the window, formatted "%Y-%m-%d".
    num_days : int
        Number of days in the window ending at ``end_date_str``.

    Returns
    -------
    float
        Weighted average of daily FinBERT sentiment, where each day's
        weight is exp(count - max_count) (hype) times 1 - tanh(age / 3)
        (recency). Returns 0.0 when no headlines fall inside the window.

    Notes
    -----
    The original implementation computed hype/recency from a ``groupby``
    over ALL dates present in the file, which was not aligned with the
    per-day sentiment list built over the requested window; the arrays
    here are all indexed by the same ``date_list``.
    """
    label_sign = {"negative": -1, "neutral": 0, "positive": 1}
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # 1. Load the data once; the same rows feed both sentiment and counts.
    table = pq.read_table(file_path, columns=["title", "date"])
    rows = table.to_pylist()

    # Date window, oldest -> newest.
    date_list = [(end_date - timedelta(days=x)).strftime("%Y-%m-%d")
                 for x in range(num_days)]
    date_list.reverse()

    # Group non-blank headlines by date, keeping only dates in the window.
    daily_headlines = {d: [] for d in date_list}
    for row in rows:
        if row["date"] in daily_headlines and row["title"] and row["title"].strip():
            daily_headlines[row["date"]].append(row["title"])

    # 2. Mean FinBERT sentiment per day (0.0 for days with no headlines).
    daily_sentiment_list = []
    for d in date_list:
        current_headlines = daily_headlines[d]
        if not current_headlines:
            daily_sentiment_list.append(0.0)
            continue
        results = nlp(current_headlines)
        day_scores = []
        for r in results:
            # Negative headlines are deliberately weighted twice as heavily.
            multiplier = 2 if r["label"] == "negative" else 1
            day_scores.append(multiplier * r["score"] * label_sign[r["label"]])
        daily_sentiment_list.append(float(np.mean(day_scores)))

    # 3. Hype and recency, aligned element-wise with date_list.
    daily_counts = np.array([len(daily_headlines[d]) for d in date_list],
                            dtype=float)
    active = daily_counts > 0
    if not active.any():
        # No mentions anywhere in the window.
        return 0.0

    # Hype: softmax-style weight relative to the busiest day.
    hype_list = np.exp(daily_counts - daily_counts.max())

    # Recency: age in days relative to the most recent day WITH mentions;
    # clamped at 0 for any later (empty) days, whose sentiment is 0 anyway.
    most_recent_idx = int(np.max(np.nonzero(active)[0]))
    t_days = np.maximum(most_recent_idx - np.arange(num_days), 0)
    recency_list = 1 - np.tanh(t_days / 3)

    # 4. Final score: weighted mean over days that actually have mentions
    # (matches the original's denominator, which summed only dates present
    # in the file).
    weights = hype_list * recency_list
    denom = weights[active].sum()
    if denom == 0:
        return 0.0
    numer = (weights * np.array(daily_sentiment_list))[active].sum()
    return numer / denom