# 38897-vm/hackathon/sentiment.py
# 2026-03-01 01:53:03 +00:00
# 92 lines, 3.3 KiB, Python
import pyarrow.parquet as pq
from datetime import datetime, date, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np
import subprocess
def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str) -> bool:
    """
    Run the Go scraper as a subprocess, passing arguments as CLI flags.

    Parameters
    ----------
    file_name : str
        Path to the Go source file executed via ``go run``.
    target_date : str
        Start date forwarded as ``-start`` (format defined by the Go tool —
        TODO confirm it expects YYYY-MM-DD).
    days : int
        Number of days to scrape, forwarded as ``-days``.
    ticker : str
        Ticker symbol forwarded as ``-ticker``.
    stock : str
        Stock name forwarded as ``-stock``.

    Returns
    -------
    bool
        True when the scraper exits with status 0, False otherwise.
        (Previously this returned None; callers that ignore the return
        value are unaffected.)
    """
    # List form (shell=False) keeps the arguments safe from shell injection.
    cmd = [
        "go", "run", file_name,
        f"-start={target_date}",
        f"-days={days}",
        f"-ticker={ticker}",
        f"-stock={stock}",
    ]
    print(f"Executing: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("Scraper Success!")
        print(result.stdout)
        return True
    print("Scraper Error:", result.stderr)
    return False
# Load FinBERT model, tokenizer, and sentiment pipeline once at import time
# so repeated calls to calculate_weighted_sentiment() reuse the same weights
# instead of re-downloading/re-loading them per call.
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Pipeline results are dicts with "label" in {"negative", "neutral", "positive"}
# and a confidence "score" in [0, 1].
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
def calculate_weighted_sentiment(file_path, end_date_str, num_days):
    """
    Compute a hype- and recency-weighted sentiment score from a parquet
    file of news mentions.

    Parameters
    ----------
    file_path : str
        Parquet file with at least "title" and "date" columns. Dates are
        assumed to be "%Y-%m-%d" strings — TODO confirm against the scraper.
    end_date_str : str
        Last day of the window, formatted "%Y-%m-%d".
    num_days : int
        Number of days in the window ending at ``end_date_str``.

    Returns
    -------
    float
        Weighted average of daily FinBERT sentiment, where each day's
        weight is exp(count - max_count) (hype) times 1 - tanh(age / 3)
        (recency). Returns 0.0 when no headlines fall inside the window.

    Notes
    -----
    The original implementation computed hype/recency from a ``groupby``
    over ALL dates present in the file, which was not aligned with the
    per-day sentiment list built over the requested window; the arrays
    here are all indexed by the same ``date_list``.
    """
    label_sign = {"negative": -1, "neutral": 0, "positive": 1}
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # 1. Load the data once; the same rows feed both sentiment and counts.
    table = pq.read_table(file_path, columns=["title", "date"])
    rows = table.to_pylist()

    # Date window, oldest -> newest.
    date_list = [(end_date - timedelta(days=x)).strftime("%Y-%m-%d")
                 for x in range(num_days)]
    date_list.reverse()

    # Group non-blank headlines by date, keeping only dates in the window.
    daily_headlines = {d: [] for d in date_list}
    for row in rows:
        if row["date"] in daily_headlines and row["title"] and row["title"].strip():
            daily_headlines[row["date"]].append(row["title"])

    # 2. Mean FinBERT sentiment per day (0.0 for days with no headlines).
    daily_sentiment_list = []
    for d in date_list:
        current_headlines = daily_headlines[d]
        if not current_headlines:
            daily_sentiment_list.append(0.0)
            continue
        results = nlp(current_headlines)
        day_scores = []
        for r in results:
            # Negative headlines are deliberately weighted twice as heavily.
            multiplier = 2 if r["label"] == "negative" else 1
            day_scores.append(multiplier * r["score"] * label_sign[r["label"]])
        daily_sentiment_list.append(float(np.mean(day_scores)))

    # 3. Hype and recency, aligned element-wise with date_list.
    daily_counts = np.array([len(daily_headlines[d]) for d in date_list],
                            dtype=float)
    active = daily_counts > 0
    if not active.any():
        # No mentions anywhere in the window.
        return 0.0

    # Hype: softmax-style weight relative to the busiest day.
    hype_list = np.exp(daily_counts - daily_counts.max())

    # Recency: age in days relative to the most recent day WITH mentions;
    # clamped at 0 for any later (empty) days, whose sentiment is 0 anyway.
    most_recent_idx = int(np.max(np.nonzero(active)[0]))
    t_days = np.maximum(most_recent_idx - np.arange(num_days), 0)
    recency_list = 1 - np.tanh(t_days / 3)

    # 4. Final score: weighted mean over days that actually have mentions
    # (matches the original's denominator, which summed only dates present
    # in the file).
    weights = hype_list * recency_list
    denom = weights[active].sum()
    if denom == 0:
        return 0.0
    numer = (weights * np.array(daily_sentiment_list))[active].sum()
    return numer / denom