Initial import
This commit is contained in:
commit
cb0542cc7c
BIN
__MACOSX/hackathon/._.DS_Store
Normal file
BIN
__MACOSX/hackathon/._.DS_Store
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._go.mod
Normal file
BIN
__MACOSX/hackathon/._go.mod
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._go.sum
Normal file
BIN
__MACOSX/hackathon/._go.sum
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._mentions.parquet
Normal file
BIN
__MACOSX/hackathon/._mentions.parquet
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._sentiment.py
Normal file
BIN
__MACOSX/hackathon/._sentiment.py
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._tsallis.py
Normal file
BIN
__MACOSX/hackathon/._tsallis.py
Normal file
Binary file not shown.
BIN
__MACOSX/hackathon/._yahooscrape.go
Normal file
BIN
__MACOSX/hackathon/._yahooscrape.go
Normal file
Binary file not shown.
BIN
hackathon/.DS_Store
vendored
Normal file
BIN
hackathon/.DS_Store
vendored
Normal file
Binary file not shown.
BIN
hackathon/__pycache__/sentiment.cpython-310.pyc
Normal file
BIN
hackathon/__pycache__/sentiment.cpython-310.pyc
Normal file
Binary file not shown.
24
hackathon/go.mod
Normal file
24
hackathon/go.mod
Normal file
@ -0,0 +1,24 @@
|
||||
module finnhub_scraper
|
||||
|
||||
go 1.25.0
|
||||
|
||||
require (
|
||||
github.com/vartanbeno/go-reddit/v2 v2.0.1
|
||||
github.com/xitongsys/parquet-go v1.6.2
|
||||
github.com/xitongsys/parquet-go-source v0.0.0-20241021075129-b732d2ac9c9b
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/apache/arrow/go/arrow v0.0.0-20200730104253-651201b0f516 // indirect
|
||||
github.com/apache/thrift v0.14.2 // indirect
|
||||
github.com/golang/protobuf v1.5.2 // indirect
|
||||
github.com/golang/snappy v0.0.3 // indirect
|
||||
github.com/google/go-querystring v1.0.0 // indirect
|
||||
github.com/klauspost/compress v1.15.9 // indirect
|
||||
github.com/pierrec/lz4/v4 v4.1.8 // indirect
|
||||
golang.org/x/net v0.10.0 // indirect
|
||||
golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a // indirect
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
|
||||
google.golang.org/appengine v1.6.7 // indirect
|
||||
google.golang.org/protobuf v1.28.0 // indirect
|
||||
)
|
||||
1050
hackathon/go.sum
Normal file
1050
hackathon/go.sum
Normal file
File diff suppressed because it is too large
Load Diff
BIN
hackathon/mentions.parquet
Normal file
BIN
hackathon/mentions.parquet
Normal file
Binary file not shown.
92
hackathon/sentiment.py
Normal file
92
hackathon/sentiment.py
Normal file
@ -0,0 +1,92 @@
|
||||
import pyarrow.parquet as pq
|
||||
from datetime import datetime, date, timedelta
|
||||
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import subprocess
|
||||
|
||||
def run_go_scraper(file_name: str, target_date: str, days: int, ticker: str, stock: str):
    """Invoke the Go scraper as a subprocess, forwarding run parameters as flags.

    Prints the scraper's stdout on success, or its stderr on failure.
    Returns None either way; the scraper's side effect is mentions.parquet.
    """
    args = [
        "go", "run", file_name,
        f"-start={target_date}",
        f"-days={days}",
        f"-ticker={ticker}",
        f"-stock={stock}",
    ]
    print(f"Executing: {' '.join(args)}")

    proc = subprocess.run(args, capture_output=True, text=True)
    if proc.returncode != 0:
        print("Scraper Error:", proc.stderr)
        return
    print("Scraper Success!")
    print(proc.stdout)
|
||||
|
||||
# Load the FinBERT model and tokenizer once at import time so every call to
# calculate_weighted_sentiment() reuses the same pipeline instead of paying
# the (large) model-load cost per call.
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
|
||||
|
||||
def calculate_weighted_sentiment(file_path, end_date_str, num_days):
    """Hype/recency-weighted sentiment over the `num_days` ending at `end_date_str`.

    Reads (title, date) rows from the parquet file, scores each day's headlines
    with the module-level FinBERT pipeline, then averages daily sentiment using
    hype (relative mention volume) and recency weights.

    Args:
        file_path: Path to a parquet file with "title" and "date" columns
            (dates formatted YYYY-MM-DD, as written by yahooscrape.go).
        end_date_str: Last day of the window, "YYYY-MM-DD".
        num_days: Number of calendar days in the window.

    Returns:
        A float (negatives are double-weighted, so roughly in [-2, 2]),
        or 0.0 when there is nothing to score.
    """
    label_sign = {"negative": -1, "neutral": 0, "positive": 1}
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d").date()

    # 1. Load data and bucket headlines by day.
    table = pq.read_table(file_path, columns=["title", "date"])
    rows = table.to_pylist()

    # Date range, oldest -> newest, ending at end_date.
    date_list = [(end_date - timedelta(days=x)).strftime("%Y-%m-%d") for x in range(num_days)]
    date_list.reverse()

    daily_headlines = {d: [] for d in date_list}
    for row in rows:
        # Skip the scraper's empty placeholder rows and out-of-window dates.
        if row["date"] in daily_headlines and row["title"] and row["title"].strip():
            daily_headlines[row["date"]].append(row["title"])

    # 2. FinBERT sentiment per day (0.0 for days with no usable headlines).
    daily_sentiment_list = []
    for d in date_list:
        current_headlines = daily_headlines[d]
        if not current_headlines:
            daily_sentiment_list.append(0.0)
            continue

        results = nlp(current_headlines)
        day_scores = []
        for r in results:
            # Negative headlines are deliberately double-weighted.
            multiplier = 2 if r["label"] == "negative" else 1
            day_scores.append(multiplier * r["score"] * label_sign[r["label"]])
        daily_sentiment_list.append(np.mean(day_scores))

    # 3. Hype and recency, aligned index-for-index with date_list.
    # BUG FIX: the original derived these from df.groupby('date') over ALL
    # dates present in the parquet file, whose length and order need not match
    # date_list -- causing broadcast errors or silently misaligned weighting.
    # It also counted the scraper's empty placeholder rows as real mentions.
    counts = np.array([len(daily_headlines[d]) for d in date_list], dtype=np.float64)
    if counts.size == 0:
        return 0.0
    # Softmax-style hype: the busiest day gets weight 1, quieter days decay.
    hype_list = np.exp(counts - counts.max())

    # Recency: days before end_date (0 for the final day of the window).
    t_days = np.arange(num_days - 1, -1, -1, dtype=np.float64)
    recency_list = 1.0 - np.tanh(t_days / 3.0)

    # 4. Final score: weighted average of daily sentiment.
    weights = hype_list * recency_list
    denom = np.sum(weights)
    if denom == 0:
        return 0.0
    return float(np.sum(weights * np.array(daily_sentiment_list)) / denom)
|
||||
169
hackathon/tsallis.py
Normal file
169
hackathon/tsallis.py
Normal file
@ -0,0 +1,169 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import yfinance as yf
|
||||
from dataclasses import dataclass
|
||||
from sentiment import calculate_weighted_sentiment, run_go_scraper
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────

Q = 0.5               # Tsallis entropic index (q < 1 amplifies fat tails)
LAMBDA_DECAY = 0.97   # Exponential decay base for time weighting of returns
WINDOW = 60           # Rolling window for Parkinson volatility (bars)
N_BINS = 15           # Histogram bins for the return distribution
BIN_SCALE = 3.5       # Bins span ±3.5 × historical std of returns
MULTI_WINDOWS = (30, 60, 120)  # Windows averaged for the multi-scale Tsallis score
PARK_WEIGHT = 0.3     # Blend weight: 0.0 = pure Tsallis, 1.0 = pure Parkinson
MIN_CAL = 120         # Minimum bars required before the first prediction
|
||||
|
||||
# ── Parkinson Range Volatility ────────────────────────────────────────────
|
||||
|
||||
def _parkinson(highs: np.ndarray, lows: np.ndarray) -> np.ndarray:
    """Rolling Parkinson range volatility over WINDOW bars.

    Entries before the window has filled are NaN.
    """
    count = len(highs)
    result = np.full(count, np.nan)

    # Guard against zero/negative lows so the log ratio stays defined,
    # and against inverted bars (high <= low) by substituting a ratio of 1.
    floor_lows = np.where(lows > 0, lows, highs)
    sq_log_range = np.log(np.where(highs > floor_lows, highs / floor_lows, 1.0)) ** 2

    denom = 4.0 * np.log(2.0)
    for bar in range(WINDOW, count):
        result[bar] = np.sqrt(np.mean(sq_log_range[bar - WINDOW:bar]) / denom)
    return result
|
||||
|
||||
def _norm(value: float, past: np.ndarray) -> float:
|
||||
clean = past[~np.isnan(past)]
|
||||
if len(clean) < 20 or np.std(clean) == 0:
|
||||
return 0.5
|
||||
z = (value - np.mean(clean)) / np.std(clean)
|
||||
return float(np.clip(np.tanh(z / 2.0 + 0.5), 0.0, 1.0))
|
||||
|
||||
# ── Tsallis Entropy Core ─────────────────────────────────────────────────
|
||||
|
||||
def _weights(T: int) -> np.ndarray:
    """Exponential decay weights for T observations, oldest first.

    The oldest bar gets LAMBDA_DECAY**T and the newest gets LAMBDA_DECAY**1.
    """
    exponents = np.arange(T, 0, -1, dtype=np.float64)
    return LAMBDA_DECAY ** exponents
|
||||
|
||||
def _weighted_probs(returns: np.ndarray, weights: np.ndarray, edges: np.ndarray) -> np.ndarray:
|
||||
n_bins = len(edges) - 1
|
||||
wp = np.zeros(n_bins, dtype=np.float64)
|
||||
idx = np.clip(np.digitize(returns, edges) - 1, 0, n_bins - 1)
|
||||
np.add.at(wp, idx, weights)
|
||||
s = weights.sum()
|
||||
if s > 0:
|
||||
wp /= s
|
||||
return wp
|
||||
|
||||
def _entropy(wp: np.ndarray) -> float:
    """Tsallis entropy S_q of a discrete distribution (zero-mass bins dropped)."""
    support = wp[wp > 0]
    if len(support) == 0:
        return 0.0
    return (1.0 - np.sum(support ** Q)) / (Q - 1.0)
|
||||
|
||||
def _max_entropy() -> float:
    """Tsallis entropy of the uniform distribution over N_BINS bins (the maximum attainable)."""
    return (N_BINS ** (1.0 - Q) - 1.0) / (1.0 - Q)
|
||||
|
||||
def _tsallis_score(returns: np.ndarray, edges: np.ndarray, window: int) -> float:
    """Normalised Tsallis entropy of the trailing `window` returns, in [0, 1].

    Uses the whole series when it is shorter than `window`; returns 0.0 if the
    theoretical maximum entropy is zero (degenerate configuration).
    """
    tail = returns[-window:] if len(returns) >= window else returns
    probs = _weighted_probs(tail, _weights(len(tail)), edges)
    ceiling = _max_entropy()
    if not ceiling:
        return 0.0
    return float(np.clip(_entropy(probs) / ceiling, 0.0, 1.0))
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class Result:
    """Bundle of outputs produced by get_score() for one ticker/date window."""
    score: float          # volatility/entropy score in [0, 1]
    sent: float           # weighted news sentiment in [-1, 1]
    trade_signal: float   # sent * score
    regime: str           # "LOW" / "MODERATE" / "HIGH"
|
||||
|
||||
def get_score(ticker_symbol: str, stock_name: str, parquet_path: str, end_dt_str: str, days_lookback: int, volatility_time: int, interval: str = "1h") -> Result:
    """Compute a blended volatility score and news sentiment for one ticker.

    Fetches `volatility_time` days of price bars from Yahoo Finance ending at
    `end_dt_str`, blends a multi-window Tsallis entropy score with Parkinson
    range volatility, and combines it with the FinBERT sentiment derived from
    `parquet_path` over the last `days_lookback` days.

    Args:
        ticker_symbol: Yahoo Finance ticker (e.g. "AAPL").
        stock_name: Company name. NOTE(review): currently unused in this
            function — the headline matching happens in the Go scraper.
        parquet_path: Path to the mentions parquet written by the scraper.
        end_dt_str: End date, "YYYY-MM-DD".
        days_lookback: Sentiment window in days.
        volatility_time: Price-history window in days.
        interval: Bar interval passed to yfinance (default "1h").

    Returns:
        A Result with score (vol, [0,1]), sent, trade_signal and regime.

    Raises:
        ValueError: if yfinance returns no data, or fewer than MIN_CAL bars.
    """
    # Parse the window bounds from the supplied end date.
    end_dt = datetime.strptime(end_dt_str, "%Y-%m-%d")
    start_dt = end_dt - timedelta(days=volatility_time)

    # 2. Fetch price data. NOTE(review): yfinance intraday intervals are only
    # available for a limited trailing period — confirm for long lookbacks.
    df = yf.Ticker(ticker_symbol).history(
        start=start_dt.strftime("%Y-%m-%d"),
        end=end_dt.strftime("%Y-%m-%d"),
        interval=interval,
        auto_adjust=True
    )

    if df.empty:
        raise ValueError(f"No data for '{ticker_symbol}'.")

    # Log returns; the first bar (NaN from the shift) is dropped.
    df["log_return"] = np.log(df["Close"] / df["Close"].shift(1))
    df = df.dropna(subset=["log_return"])

    returns = df["log_return"].values
    highs = df["High"].values
    lows = df["Low"].values

    if len(returns) < MIN_CAL:
        raise ValueError(f"Need {MIN_CAL} bars, got {len(returns)}.")

    # 3. Tsallis volatility: bin edges are calibrated on all but the most
    # recent return (±BIN_SCALE std around the mean), then the entropy score
    # is averaged over every MULTI_WINDOWS size the series can support.
    r = returns[:-1].astype(np.float64)
    half = BIN_SCALE * np.std(r)
    edges = np.linspace(np.mean(r) - half, np.mean(r) + half, N_BINS + 1)
    ts = [_tsallis_score(returns, edges, w) for w in MULTI_WINDOWS if len(returns) >= w]
    tsallis = float(np.mean(ts)) if ts else 0.5

    # 4. Parkinson volatility, normalised against its own history; falls back
    # to the neutral 0.5 when the last value is still in the NaN warm-up.
    park = 0.5
    if PARK_WEIGHT > 0:
        pv = _parkinson(highs, lows)
        if not np.isnan(pv[-1]):
            park = _norm(pv[-1], pv[:-1])

    # 5. Blend the two volatility estimates and attach sentiment.
    vol_score = float(np.clip((1.0 - PARK_WEIGHT) * tsallis + PARK_WEIGHT * park, 0.0, 1.0))

    sent = calculate_weighted_sentiment(parquet_path, end_date_str=end_dt_str, num_days=days_lookback)

    trade_signal = sent * vol_score
    regime = "LOW" if vol_score < 0.3 else "MODERATE" if vol_score < 0.6 else "HIGH"

    return Result(
        score=vol_score,
        sent=sent,
        trade_signal=trade_signal,
        regime=regime
    )
|
||||
|
||||
# ── Execution ─────────────────────────────────────────────────────────────
|
||||
|
||||
if __name__ == "__main__":
    # 1. Gather run parameters interactively.
    ticker = str(input("Stock ticker: "))            # Yahoo/Finnhub ticker symbol
    company = str(input("Stock name: "))             # headline search keyword
    sent_days = int(input("Sentiment lookback: "))
    today = (datetime.now()).strftime("%Y-%m-%d")
    vol_days = int(input("Volatility lookback: "))

    # 2. Refresh mentions.parquet via the Go scraper before scoring.
    run_go_scraper("yahooscrape.go", today, sent_days, ticker, company)

    # 3. Run the analysis pipeline and print the verdict.
    try:
        res = get_score(
            ticker_symbol=ticker,
            stock_name=company,
            parquet_path="mentions.parquet",
            end_dt_str=today,
            days_lookback=sent_days,
            volatility_time=vol_days,
        )

        print(f"\n--- {ticker} ({res.regime} VOLATILITY) ---")
        print(f"Vol Score: {res.score:.4f}")
        print(f"Sentiment: {res.sent:.4f}")
        print(f"Trade Signal: {res.trade_signal:.4f}")

        # Map the volatility score onto a coarse trading verdict.
        if res.score < 0.3:
            print("Verdict: SIT OUT (Low Activity)")
        elif res.score > 0.6:
            side = "BUY" if res.trade_signal > 0 else "SELL"
            print(f"Verdict: FULL CONVICTION {side}")
        else:
            print("Verdict: MODERATE (Monitor/Small Position)")

    except Exception as e:
        print(f"Pipeline Error: {e}")
|
||||
111
hackathon/yahooscrape.go
Normal file
111
hackathon/yahooscrape.go
Normal file
@ -0,0 +1,111 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/xitongsys/parquet-go-source/local"
|
||||
"github.com/xitongsys/parquet-go/writer"
|
||||
)
|
||||
|
||||
// FinnhubNews is the subset of a Finnhub /company-news response item
// that this scraper consumes.
type FinnhubNews struct {
	Datetime int64  `json:"datetime"` // publication time, Unix seconds
	Headline string `json:"headline"` // article headline text
	Source   string `json:"source"`   // publishing outlet name
}
|
||||
|
||||
// NewsRecord is one row of mentions.parquet, the file read back by
// sentiment.py (columns: title, source, date).
type NewsRecord struct {
	Title  string `parquet:"name=title, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	Source string `parquet:"name=source, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	// Date is formatted YYYY-MM-DD to match sentiment.py's date_list keys.
	Date string `parquet:"name=date, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
}
|
||||
|
||||
func main() {
|
||||
// 1. Define flags for input
|
||||
startDatePtr := flag.String("start", "Na", "Start date in YYYY-MM-DD format")
|
||||
daysPtr := flag.Int("days", 0, "Number of days to look back")
|
||||
|
||||
tickerPtr := flag.String("ticker", "", "Stock Ticker")
|
||||
stockPtr := flag.String("stock", "", "Stock Name")
|
||||
|
||||
flag.Parse()
|
||||
|
||||
apiKey := "d6hkh6hr01qr5k4cb93gd6hkh6hr01qr5k4cb940"
|
||||
|
||||
// 2. Parse the input start date
|
||||
baseDate, err := time.Parse("2006-01-02", *startDatePtr)
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid date format. Use YYYY-MM-DD: %v", err)
|
||||
}
|
||||
|
||||
fw, err := local.NewLocalFileWriter("mentions.parquet")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
pw, err := writer.NewParquetWriter(fw, new(NewsRecord), 4)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
for i := 0; i < *daysPtr; i++ {
|
||||
// 3. Subtract days from the PROVIDED start date instead of 'Now'
|
||||
currentDate := baseDate.AddDate(0, 0, -i)
|
||||
dateStr := currentDate.Format("2006-01-02")
|
||||
|
||||
fmt.Printf("Checking %s... ", dateStr)
|
||||
|
||||
// Updated symbol parameter to 'ticker'
|
||||
url := fmt.Sprintf(
|
||||
"https://finnhub.io/api/v1/company-news?symbol=%s&from=%s&to=%s&token=%s",
|
||||
*tickerPtr, dateStr, dateStr, apiKey,
|
||||
)
|
||||
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
fmt.Printf("Network Error: %v\n", err)
|
||||
continue
|
||||
}
|
||||
|
||||
var news []FinnhubNews
|
||||
if err := json.NewDecoder(resp.Body).Decode(&news); err != nil {
|
||||
resp.Body.Close()
|
||||
fmt.Printf("Decode Error\n")
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
matchCount := 0
|
||||
for _, article := range news {
|
||||
h := strings.ToLower(article.Headline)
|
||||
|
||||
// Dynamically check for the stock name or ticker passed in via flags
|
||||
if strings.Contains(h, strings.ToLower(*stockPtr)) || strings.Contains(h, strings.ToLower(*tickerPtr)) {
|
||||
record := NewsRecord{
|
||||
Title: article.Headline,
|
||||
Source: article.Source,
|
||||
Date: dateStr,
|
||||
}
|
||||
pw.Write(record)
|
||||
matchCount++
|
||||
}
|
||||
}
|
||||
|
||||
if matchCount == 0 {
|
||||
fmt.Print("No matches. Writing placeholder.")
|
||||
pw.Write(NewsRecord{Title: "", Source: "NONE", Date: dateStr})
|
||||
} else {
|
||||
fmt.Printf("Found %d matches.", matchCount)
|
||||
}
|
||||
fmt.Println()
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
pw.WriteStop()
|
||||
fw.Close()
|
||||
fmt.Println("\nFinished! Check mentions.parquet")
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user