Initial import

This commit is contained in:
Flatlogic Bot 2026-03-01 01:53:03 +00:00
commit b54ba42545
15 changed files with 1446 additions and 0 deletions

Binary file not shown.

BIN
__MACOSX/hackathon/._go.mod Normal file

Binary file not shown.

BIN
__MACOSX/hackathon/._go.sum Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
hackathon/.DS_Store vendored Normal file

Binary file not shown.

Binary file not shown.

24
hackathon/go.mod Normal file
View File

@ -0,0 +1,24 @@
module finnhub_scraper
go 1.25.0
require (
github.com/vartanbeno/go-reddit/v2 v2.0.1
github.com/xitongsys/parquet-go v1.6.2
github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b
)
require (
github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect
github.com/apache/thrift v0.14.2 // indirect
github.com/golang/protobuf v1.5.2 // indirect
github.com/golang/snappy v0.0.3 // indirect
github.com/google/go-querystring v1.0.0 // indirect
github.com/klauspost/compress v1.15.9 // indirect
github.com/pierrec/lz4/v4 v4.1.8 // indirect
golang.org/x/net v0.10.0 // indirect
golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a // indirect
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.28.0 // indirect
)

1050
hackathon/go.sum Normal file

File diff suppressed because it is too large Load Diff

BIN
hackathon/mentions.parquet Normal file

Binary file not shown.

92
hackathon/sentiment.py Normal file
View File

@ -0,0 +1,92 @@
import pyarrow.parquet as pq
from datetime import datetime, date, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import numpy as np
import subprocess
def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str):
    """Invoke the Go scraper via `go run` and report its outcome.

    Passes the window start date, lookback days, ticker symbol and stock
    search keyword as CLI flags; prints stdout on success, stderr on failure.
    """
    args = [
        "go", "run", file_name,
        f"-start={target_date}",
        f"-days={days}",
        f"-ticker={ticker}",
        f"-stock={stock}",
    ]
    print(f"Executing: {' '.join(args)}")
    completed = subprocess.run(args, capture_output=True, text=True)
    if completed.returncode != 0:
        print("Scraper Error:", completed.stderr)
    else:
        print("Scraper Success!")
        print(completed.stdout)
# Load the FinBERT model and tokenizer once at module import time so that
# repeated calls to calculate_weighted_sentiment() reuse the same pipeline
# instead of re-downloading/re-initializing the model per call.
MODEL_NAME = "ProsusAI/finbert"  # financial-domain sentiment model (positive/negative/neutral labels)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
def calculate_weighted_sentiment(file_path, end_date_str, num_days):
    """Score news sentiment over a date window, weighted by hype and recency.

    Args:
        file_path: Parquet file with at least 'title' and 'date' (YYYY-MM-DD) columns.
        end_date_str: Window end date, "YYYY-MM-DD" (inclusive).
        num_days: Number of days in the window, counting back from end_date_str.

    Returns:
        Hype/recency-weighted mean of daily FinBERT sentiment (negatives are
        double-weighted, so roughly in [-2, 1]); 0.0 for an empty window.
    """
    label_sign = {"negative": -1, "neutral": 0, "positive": 1}
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # Guard the empty window so max()/division below cannot blow up.
    if num_days <= 0:
        return 0.0

    # 1. Load and filter data.
    table = pq.read_table(file_path, columns=["title", "date"])
    rows = table.to_pylist()

    # Date range, oldest to newest.
    date_list = [(end_date - timedelta(days=x)).strftime("%Y-%m-%d") for x in range(num_days)]
    date_list.reverse()

    # Group non-blank headlines by date (the scraper writes placeholder rows
    # with empty titles on no-match days; skip those).
    daily_headlines = {d: [] for d in date_list}
    for row in rows:
        if row["date"] in daily_headlines and row["title"] and row["title"].strip():
            daily_headlines[row["date"]].append(row["title"])

    # 2. Per-day sentiment: mean signed FinBERT score.
    daily_sentiment_list = []
    for d in date_list:
        headlines = daily_headlines[d]
        if not headlines:
            daily_sentiment_list.append(0.0)
            continue
        results = nlp(headlines)
        day_scores = []
        for r in results:
            multiplier = 2 if r["label"] == "negative" else 1  # penalize bad news harder
            day_scores.append(multiplier * r["score"] * label_sign[r["label"]])
        daily_sentiment_list.append(np.mean(day_scores))

    # 3. Hype and recency weights.
    # FIX: previously counts came from df.groupby over *all* dates present in
    # the file (sorted, possibly missing days), so the hype/recency arrays
    # could misalign with daily_sentiment_list (length mismatch or shifted
    # dates). Reindex the counts onto the exact window instead.
    df = pd.read_parquet(file_path)
    daily_counts = df.groupby('date').size().reindex(date_list, fill_value=0)
    hype_list = np.exp(daily_counts.to_numpy() - daily_counts.max())  # 1.0 on the busiest day
    # Recency measured against the window end (not the last date in the file),
    # oldest day = num_days-1 days back; tanh squashes old days toward 0.
    t_days = np.arange(num_days - 1, -1, -1)
    recency_list = 1 - np.tanh(t_days / 3)

    # 4. Final score: weighted average of daily sentiment.
    weights = hype_list * recency_list
    if np.sum(weights) == 0:
        return 0.0
    return float(np.sum(weights * np.array(daily_sentiment_list)) / np.sum(weights))

169
hackathon/tsallis.py Normal file
View File

@ -0,0 +1,169 @@
import numpy as np
import pandas as pd
import yfinance as yf
from dataclasses import dataclass
from sentiment import calculate_weighted_sentiment, run_go_scraper
from datetime import datetime, timedelta
# ── Config ────────────────────────────────────────────────────────────────
Q = 0.5 # Tsallis entropic index (q<1 amplifies fat tails)
LAMBDA_DECAY = 0.97 # Exponential decay for time weighting (newest bar weighted highest)
WINDOW = 60 # Rolling window for the Parkinson estimator (bars)
N_BINS = 15 # Histogram bins for the return distribution
BIN_SCALE = 3.5 # Bins span ±3.5 × historical std of returns
MULTI_WINDOWS = (30, 60, 120) # Windows (bars) averaged for the multi-scale Tsallis score
PARK_WEIGHT = 0.3 # Blend weight: 0.0 = pure Tsallis, 1.0 = pure Parkinson
MIN_CAL = 120 # Minimum bars of history before first prediction
# ── Parkinson Range Volatility ────────────────────────────────────────────
def _parkinson(highs: np.ndarray, lows: np.ndarray) -> np.ndarray:
    """Rolling Parkinson high/low range volatility estimator.

    Returns an array aligned with the inputs; entries before WINDOW bars are
    NaN. Non-positive lows are replaced by the corresponding high so the log
    ratio stays defined (the squared term then contributes 0).
    """
    n = len(highs)
    result = np.full(n, np.nan)
    guarded_lows = np.where(lows > 0, lows, highs)
    ratio = np.where(highs > guarded_lows, highs / guarded_lows, 1.0)
    sq_log_range = np.log(ratio) ** 2
    for bar in range(WINDOW, n):
        window_mean = np.mean(sq_log_range[bar - WINDOW:bar])
        result[bar] = np.sqrt(window_mean / (4.0 * np.log(2.0)))
    return result
def _norm(value: float, past: np.ndarray) -> float:
clean = past[~np.isnan(past)]
if len(clean) < 20 or np.std(clean) == 0:
return 0.5
z = (value - np.mean(clean)) / np.std(clean)
return float(np.clip(np.tanh(z / 2.0 + 0.5), 0.0, 1.0))
# ── Tsallis Entropy Core ─────────────────────────────────────────────────
def _weights(T: int) -> np.ndarray:
    """Exponential-decay weights for T bars; the newest bar gets the largest weight."""
    exponents = np.arange(T, 0, -1, dtype=np.float64)
    return np.power(LAMBDA_DECAY, exponents)
def _weighted_probs(returns: np.ndarray, weights: np.ndarray, edges: np.ndarray) -> np.ndarray:
n_bins = len(edges) - 1
wp = np.zeros(n_bins, dtype=np.float64)
idx = np.clip(np.digitize(returns, edges) - 1, 0, n_bins - 1)
np.add.at(wp, idx, weights)
s = weights.sum()
if s > 0:
wp /= s
return wp
def _entropy(wp: np.ndarray) -> float:
    """Tsallis entropy of a probability vector; zero-probability bins are ignored."""
    support = wp[wp > 0]
    if support.size == 0:
        return 0.0
    return (1.0 - np.sum(support ** Q)) / (Q - 1.0)
def _max_entropy() -> float:
    """Tsallis entropy of the uniform distribution over N_BINS bins (normalization ceiling)."""
    exponent = 1.0 - Q
    return (N_BINS ** exponent - 1.0) / exponent
def _tsallis_score(returns: np.ndarray, edges: np.ndarray, window: int) -> float:
    """Normalized Tsallis entropy of the last `window` returns, clipped to [0, 1]."""
    recent = returns if len(returns) < window else returns[-window:]
    probs = _weighted_probs(recent, _weights(len(recent)), edges)
    ceiling = _max_entropy()
    if not ceiling:
        return 0.0
    return float(np.clip(_entropy(probs) / ceiling, 0.0, 1.0))
# ── Public API ────────────────────────────────────────────────────────────
@dataclass
class Result:
    """Bundle of volatility score, news sentiment, combined signal and regime label."""
    score: float # Volatility/Entropy score in [0, 1]
    sent: float # News sentiment (negative values bearish, positive bullish)
    trade_signal: float # sentiment * score
    regime: str # LOW / MODERATE / HIGH volatility regime
def get_score(ticker_symbol: str, stock_name: str, parquet_path: str, end_dt_str: str, days_lookback: int, volatility_time: int, interval: str = "1h") -> Result:
    """Fetch price history, blend Tsallis and Parkinson volatility with news
    sentiment, and return a Result.

    Args:
        ticker_symbol: Yahoo Finance ticker to pull prices for.
        stock_name: Search keyword (currently unused here — the Go scraper
            consumes it; kept for interface stability).
        parquet_path: Path to the scraped mentions parquet file.
        end_dt_str: Analysis end date, "YYYY-MM-DD".
        days_lookback: Sentiment window length in days.
        volatility_time: Price-history window length in days.
        interval: Bar interval passed to yfinance.

    Raises:
        ValueError: when no price data is returned or fewer than MIN_CAL bars exist.
    """
    end_dt = datetime.strptime(end_dt_str, "%Y-%m-%d")
    start_dt = end_dt - timedelta(days=volatility_time)

    # Price history for the volatility window.
    history = yf.Ticker(ticker_symbol).history(
        start=start_dt.strftime("%Y-%m-%d"),
        end=end_dt.strftime("%Y-%m-%d"),
        interval=interval,
        auto_adjust=True,
    )
    if history.empty:
        raise ValueError(f"No data for '{ticker_symbol}'.")

    history["log_return"] = np.log(history["Close"] / history["Close"].shift(1))
    history = history.dropna(subset=["log_return"])
    returns = history["log_return"].values
    highs = history["High"].values
    lows = history["Low"].values
    if len(returns) < MIN_CAL:
        raise ValueError(f"Need {MIN_CAL} bars, got {len(returns)}.")

    # Tsallis component: bin edges built from all but the latest return.
    past = returns[:-1].astype(np.float64)
    center = np.mean(past)
    half_span = BIN_SCALE * np.std(past)
    edges = np.linspace(center - half_span, center + half_span, N_BINS + 1)
    scores = [_tsallis_score(returns, edges, w) for w in MULTI_WINDOWS if len(returns) >= w]
    tsallis = float(np.mean(scores)) if scores else 0.5

    # Parkinson component (neutral 0.5 unless it can be computed).
    park = 0.5
    if PARK_WEIGHT > 0:
        pv = _parkinson(highs, lows)
        if not np.isnan(pv[-1]):
            park = _norm(pv[-1], pv[:-1])

    # Blend the two estimators, then attach sentiment over the same end date.
    blended = (1.0 - PARK_WEIGHT) * tsallis + PARK_WEIGHT * park
    vol_score = float(np.clip(blended, 0.0, 1.0))
    sent = calculate_weighted_sentiment(parquet_path, end_date_str=end_dt_str, num_days=days_lookback)

    if vol_score < 0.3:
        regime = "LOW"
    elif vol_score < 0.6:
        regime = "MODERATE"
    else:
        regime = "HIGH"

    return Result(
        score=vol_score,
        sent=sent,
        trade_signal=sent * vol_score,
        regime=regime,
    )
# ── Execution ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Gather run parameters interactively.
    target_ticker = str(input("Stock ticker: "))
    target_stock = str(input("Stock name: "))
    lookback = int(input("Sentiment lookback: "))
    run_date = (datetime.now()).strftime("%Y-%m-%d")
    volatility_lookback = int(input("Volatility lookback: "))

    # Refresh mentions.parquet via the Go scraper before scoring.
    run_go_scraper("yahooscrape.go", run_date, lookback, target_ticker, target_stock)

    # Run the analysis pipeline; any failure is reported, not raised.
    try:
        res = get_score(
            ticker_symbol=target_ticker,
            stock_name=target_stock,
            parquet_path="mentions.parquet",
            end_dt_str=run_date,
            days_lookback=lookback,
            volatility_time=volatility_lookback,
        )
        print(f"\n--- {target_ticker} ({res.regime} VOLATILITY) ---")
        print(f"Vol Score: {res.score:.4f}")
        print(f"Sentiment: {res.sent:.4f}")
        print(f"Trade Signal: {res.trade_signal:.4f}")

        # Verdict: act only when volatility is high enough to matter.
        if res.score < 0.3:
            print("Verdict: SIT OUT (Low Activity)")
        elif res.score > 0.6:
            action = "BUY" if res.trade_signal > 0 else "SELL"
            print(f"Verdict: FULL CONVICTION {action}")
        else:
            print("Verdict: MODERATE (Monitor/Small Position)")
    except Exception as e:
        print(f"Pipeline Error: {e}")

111
hackathon/yahooscrape.go Normal file
View File

@ -0,0 +1,111 @@
package main
import (
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/xitongsys/parquet-go-source/local"
	"github.com/xitongsys/parquet-go/writer"
)
// FinnhubNews is the subset of Finnhub's /company-news JSON response
// that the scraper consumes.
type FinnhubNews struct {
	Datetime int64 `json:"datetime"`
	Headline string `json:"headline"`
	Source string `json:"source"`
}

// NewsRecord is the parquet row schema written to mentions.parquet;
// the Python pipeline reads the title and date columns.
type NewsRecord struct {
	Title string `parquet:"name=title, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	Source string `parquet:"name=source, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	Date string `parquet:"name=date, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
}
// main scrapes Finnhub company news day-by-day (walking backwards from the
// -start date) and writes headlines matching the ticker or stock name to
// mentions.parquet for the Python sentiment pipeline.
func main() {
	// 1. CLI flags supplied by sentiment.run_go_scraper.
	startDatePtr := flag.String("start", "Na", "Start date in YYYY-MM-DD format")
	daysPtr := flag.Int("days", 0, "Number of days to look back")
	tickerPtr := flag.String("ticker", "", "Stock Ticker")
	stockPtr := flag.String("stock", "", "Stock Name")
	flag.Parse()

	// SECURITY: prefer the FINNHUB_TOKEN env var; the inline literal is a
	// fallback only and should be rotated/removed before sharing this code.
	apiKey := os.Getenv("FINNHUB_TOKEN")
	if apiKey == "" {
		apiKey = "d6hkh6hr01qr5k4cb93gd6hkh6hr01qr5k4cb940"
	}

	// 2. Parse the input start date.
	baseDate, err := time.Parse("2006-01-02", *startDatePtr)
	if err != nil {
		log.Fatalf("Invalid date format. Use YYYY-MM-DD: %v", err)
	}

	fw, err := local.NewLocalFileWriter("mentions.parquet")
	if err != nil {
		log.Fatal(err)
	}
	pw, err := writer.NewParquetWriter(fw, new(NewsRecord), 4)
	if err != nil {
		log.Fatal(err)
	}

	// Lowercase the search terms once, outside the loop.
	stockLower := strings.ToLower(*stockPtr)
	tickerLower := strings.ToLower(*tickerPtr)

	for i := 0; i < *daysPtr; i++ {
		// 3. Walk backwards from the provided start date.
		currentDate := baseDate.AddDate(0, 0, -i)
		dateStr := currentDate.Format("2006-01-02")
		fmt.Printf("Checking %s... ", dateStr)

		url := fmt.Sprintf(
			"https://finnhub.io/api/v1/company-news?symbol=%s&from=%s&to=%s&token=%s",
			*tickerPtr, dateStr, dateStr, apiKey,
		)
		resp, err := http.Get(url)
		if err != nil {
			fmt.Printf("Network Error: %v\n", err)
			continue
		}
		// FIX: surface non-200 responses (rate limit, bad token) explicitly
		// instead of letting them show up as a generic decode failure.
		if resp.StatusCode != http.StatusOK {
			resp.Body.Close()
			fmt.Printf("HTTP %d from Finnhub\n", resp.StatusCode)
			continue
		}
		var news []FinnhubNews
		decodeErr := json.NewDecoder(resp.Body).Decode(&news)
		resp.Body.Close()
		if decodeErr != nil {
			fmt.Printf("Decode Error\n")
			continue
		}

		matchCount := 0
		for _, article := range news {
			h := strings.ToLower(article.Headline)
			// Match either the stock name keyword or the ticker symbol.
			if strings.Contains(h, stockLower) || strings.Contains(h, tickerLower) {
				record := NewsRecord{
					Title:  article.Headline,
					Source: article.Source,
					Date:   dateStr,
				}
				// FIX: the write error was previously discarded (silent data loss).
				if err := pw.Write(record); err != nil {
					log.Printf("parquet write failed: %v", err)
				}
				matchCount++
			}
		}
		if matchCount == 0 {
			fmt.Print("No matches. Writing placeholder.")
			// Placeholder row keeps every requested date present downstream.
			if err := pw.Write(NewsRecord{Title: "", Source: "NONE", Date: dateStr}); err != nil {
				log.Printf("parquet write failed: %v", err)
			}
		} else {
			fmt.Printf("Found %d matches.", matchCount)
		}
		fmt.Println()
		time.Sleep(1 * time.Second) // stay under Finnhub's request rate limit
	}

	// FIX: check finalization errors so a truncated parquet file is not
	// reported as success.
	if err := pw.WriteStop(); err != nil {
		log.Fatalf("parquet finalize failed: %v", err)
	}
	if err := fw.Close(); err != nil {
		log.Fatalf("file close failed: %v", err)
	}
	fmt.Println("\nFinished! Check mentions.parquet")
}