import subprocess
from datetime import datetime, date, timedelta

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline


def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str) -> None:
    """Invoke the Go news scraper as a subprocess.

    Runs ``go run <file_name>`` with the scraper's CLI flags and prints
    its stdout on success or stderr on failure.

    Args:
        file_name: Path to the Go source file to execute.
        target_date: Start date passed as ``-start=`` (format is whatever
            the Go program expects; presumably YYYY-MM-DD — TODO confirm).
        days: Number of days passed as ``-days=``.
        ticker: Ticker symbol passed as ``-ticker=``.
        stock: Stock name passed as ``-stock=``.
    """
    # List-form argv (shell=False) — arguments are passed verbatim, no
    # shell injection risk.
    cmd = [
        "go", "run", file_name,
        f"-start={target_date}",
        f"-days={days}",
        f"-ticker={ticker}",
        f"-stock={stock}",
    ]
    print(f"Executing: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print("Scraper Success!")
        print(result.stdout)
    else:
        print("Scraper Error:", result.stderr)


# Load model and tokenizer once at the module level to save memory/time.
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


def calculate_weighted_sentiment(file_path, end_date_str, num_days):
    """Calculate a weighted sentiment score from a parquet file of news mentions.

    The score is a weighted average of per-day FinBERT sentiment over the
    ``num_days`` window ending at ``end_date_str``:

        score = sum(hype * recency * sentiment) / sum(hype * recency)

    where, per day in the window:
      * ``sentiment`` is the mean signed FinBERT score of that day's
        headlines (negatives carry a 2x penalty multiplier), 0.0 for days
        with no headlines;
      * ``hype`` = exp(count - max_count), so the busiest day weighs 1.0;
      * ``recency`` = 1 - tanh(days_before_window_end / 3), so recent days
        weigh more.

    All three series are computed on the same fixed per-day axis
    (oldest -> newest), which fixes the original length-mismatch between
    the sentiment list (always ``num_days`` long) and the hype/recency
    lists (previously built only from dates present in the file, and
    including out-of-window dates).

    Args:
        file_path: Path to a parquet file with at least ``title`` and
            ``date`` (YYYY-MM-DD string) columns.
        end_date_str: Window end date, "YYYY-MM-DD" (inclusive).
        num_days: Window length in days.

    Returns:
        float: The weighted sentiment score, or 0.0 when the window is
        empty or contains no mentions.
    """
    if num_days <= 0:
        return 0.0  # empty window -> nothing to score

    label_sign = {"negative": -1, "neutral": 0, "positive": 1}
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # 1. Load data once (columns only; counts are derived from these rows
    # too, so the file is no longer read a second time via pandas).
    table = pq.read_table(file_path, columns=["title", "date"])
    rows = table.to_pylist()

    # Date axis for the window, oldest -> newest.
    date_list = [
        (end_date - timedelta(days=x)).strftime("%Y-%m-%d")
        for x in range(num_days)
    ]
    date_list.reverse()

    # Group headlines and count mentions per window date in one pass.
    # Counts include rows with empty titles (matching the original
    # groupby-size semantics); headlines keep only non-blank titles.
    daily_headlines = {d: [] for d in date_list}
    daily_counts = {d: 0 for d in date_list}
    for row in rows:
        d = row["date"]
        if d in daily_headlines:  # silently drop out-of-window dates
            daily_counts[d] += 1
            title = row["title"]
            if title and title.strip():
                daily_headlines[d].append(title)

    # 2. Per-day sentiment: mean signed FinBERT score, negatives doubled.
    daily_sentiment_list = []
    for d in date_list:
        headlines = daily_headlines[d]
        if not headlines:
            daily_sentiment_list.append(0.0)
            continue
        results = nlp(headlines)
        day_scores = []
        for r in results:
            # Specific negative-weight multiplier: bad news counts double.
            multiplier = 2 if r["label"] == "negative" else 1
            day_scores.append(multiplier * r["score"] * label_sign[r["label"]])
        daily_sentiment_list.append(float(np.mean(day_scores)))

    # 3. Hype and recency on the same date_list axis.
    counts = np.array([daily_counts[d] for d in date_list], dtype=float)
    if counts.max() == 0:
        return 0.0  # no mentions anywhere in the window
    hype = np.exp(counts - counts.max())

    # Days elapsed before the window end: newest day -> 0, oldest -> num_days-1.
    t_days = np.arange(num_days - 1, -1, -1, dtype=float)
    recency = 1.0 - np.tanh(t_days / 3)

    # 4. Final weighted average; guard the (theoretical) zero denominator.
    weights = hype * recency
    denom = weights.sum()
    if denom == 0:
        return 0.0
    return float((weights * np.array(daily_sentiment_list)).sum() / denom)