# NOTE(review): the following header was viewer metadata accidentally pasted
# into the file ("92 lines, 3.3 KiB, Python") — commented out, not code.
import pyarrow.parquet as pq
|
|
from datetime import datetime, date, timedelta
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
|
import pandas as pd
|
|
import numpy as np
|
|
import subprocess
|
|
|
|
def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str):
|
|
# This calls the Go file with arguments
|
|
cmd = ["go", "run", file_name, f"-start={target_date}", f"-days={days}", f"-ticker={ticker}", f"-stock={stock}"]
|
|
|
|
print(f"Executing: {' '.join(cmd)}")
|
|
result = subprocess.run(cmd, capture_output=True, text=True)
|
|
|
|
if result.returncode == 0:
|
|
print("Scraper Success!")
|
|
print(result.stdout)
|
|
else:
|
|
print("Scraper Error:", result.stderr)
|
|
|
|
# Load model and tokenizer once at the module level to save memory/time
|
|
MODEL_NAME = "ProsusAI/finbert"
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
|
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
|
|
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
|
|
|
|
def calculate_weighted_sentiment(file_path, end_date_str, num_days):
|
|
"""
|
|
Calculates the weighted sentiment score from a parquet file of news mentions.
|
|
"""
|
|
label_sign = {"negative": -1, "neutral": 0, "positive": 1}
|
|
end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()
|
|
|
|
# 1. Load and Filter Data
|
|
table = pq.read_table(file_path, columns=["title", "date"])
|
|
rows = table.to_pylist()
|
|
|
|
# Create date range (Oldest to newest)
|
|
date_list = [(end_date - timedelta(days=x)).strftime("%Y-%m-%d") for x in range(num_days)]
|
|
date_list.reverse()
|
|
|
|
# Group headlines by date
|
|
daily_headlines = {d: [] for d in date_list}
|
|
for row in rows:
|
|
if row["date"] in daily_headlines:
|
|
if row["title"] and row["title"].strip():
|
|
daily_headlines[row["date"]].append(row["title"])
|
|
|
|
# 2. Calculate Sentiment per Day
|
|
daily_sentiment_list = []
|
|
for d in date_list:
|
|
current_headlines = daily_headlines[d]
|
|
|
|
if not current_headlines:
|
|
daily_sentiment_list.append(0.0)
|
|
continue
|
|
|
|
results = nlp(current_headlines)
|
|
|
|
day_scores = []
|
|
for r in results:
|
|
# Applying your specific negative weight multiplier
|
|
multiplier = 2 if r["label"] == "negative" else 1
|
|
score = multiplier * r["score"] * label_sign[r["label"]]
|
|
day_scores.append(score)
|
|
|
|
daily_sentiment_list.append(np.mean(day_scores))
|
|
|
|
# 3. Calculate Hype and Recency
|
|
df = pd.read_parquet(file_path)
|
|
daily_counts = df.groupby('date').size()
|
|
|
|
# Hype list
|
|
max_val = daily_counts.max()
|
|
hype_list = np.exp(daily_counts - max_val)
|
|
|
|
# Recency list
|
|
unique_dates = pd.to_datetime(daily_counts.index)
|
|
most_recent_event = unique_dates.max()
|
|
t_days = (most_recent_event - unique_dates).to_series().dt.days
|
|
recency_list = (1 - np.tanh(t_days / 3)).tolist()
|
|
|
|
# 4. Final Scoring
|
|
weighted_final_scores = np.array(hype_list) * np.array(recency_list) * np.array(daily_sentiment_list)
|
|
final_scores = np.array(hype_list) * np.array(recency_list)
|
|
|
|
# Handle division by zero if lists are empty
|
|
if np.sum(final_scores) == 0:
|
|
return 0.0
|
|
|
|
total_sum = np.sum(weighted_final_scores) / np.sum(final_scores)
|
|
return total_sum |