Files
MoFin/venv/lib/python3.12/site-packages/alphasift/snapshot_us.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

246 lines
8.2 KiB
Python

# -*- coding: utf-8 -*-
"""US equity snapshot via yfinance.
Pluggable adapter for AlphaSift's L1 pipeline. Fetches a configurable
equity universe and returns the standard snapshot DataFrame schema.
HK is not supported yet: there is no HK universe source or ticker
configuration path, so ``market="hk"`` is rejected at the pipeline level
rather than silently screening the US pool.
"""
import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Wikipedia page parsed by _fetch_sp500_tickers() for the S&P 500 constituents.
_SP500_WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
# Built-in fallback universe of 49 US large-cap tickers, used by the "default"
# source of fetch_us_universe() and as its last resort in "auto" mode.
# Share classes use the dash form (BRK-B), the same form that
# _fetch_sp500_tickers() normalises scraped symbols to.
_DEFAULT_US_UNIVERSE = [
"AAPL", "MSFT", "NVDA", "AMZN", "GOOGL", "META", "TSLA", "BRK-B",
"AVGO", "JPM", "LLY", "V", "MA", "UNH", "XOM", "COST", "HD", "PG",
"JNJ", "ABBV", "WMT", "NFLX", "BAC", "KO", "CRM", "CVX", "MRK",
"PEP", "AMD", "TMO", "LIN", "ACN", "CSCO", "MCD", "ABT", "ADBE",
"WFC", "GE", "DHR", "TXN", "PM", "ISRG", "MS", "NEE", "INTU",
"DIS", "QCOM", "CAT", "NOW",
]
def fetch_us_universe(source: str = "auto") -> list[str]:
    """Resolve the US equity ticker universe from the requested source.

    Supported values for ``source`` (matched case-insensitively):
        sp500   -- scrape the S&P 500 constituents from Wikipedia
        env     -- parse ALPHASIFT_US_TICKERS (comma-separated)
        default -- the built-in large-cap list
        auto    -- try sp500, then env, then default; the first source that
                   yields a non-empty list wins

    Raises:
        ValueError: If ``source`` is not one of the values above, or if
            ``env`` is requested while ALPHASIFT_US_TICKERS is unset or blank.
    """
    kind = source.lower()

    if kind == "sp500":
        return _fetch_sp500_tickers()

    if kind == "env":
        env_value = os.getenv("ALPHASIFT_US_TICKERS", "").strip()
        if not env_value:
            raise ValueError("ALPHASIFT_US_TICKERS not set")
        symbols = []
        for piece in env_value.split(","):
            cleaned = piece.strip()
            # Skip empty fragments produced by stray or trailing commas.
            if cleaned:
                symbols.append(cleaned)
        return symbols

    if kind == "default":
        return list(_DEFAULT_US_UNIVERSE)

    if kind != "auto":
        raise ValueError(f"Unknown US universe source: {source}")

    # "auto": walk the concrete sources in priority order; a failing or empty
    # source is logged at debug level and the next one is tried.
    for candidate in ("sp500", "env", "default"):
        try:
            found = fetch_us_universe(candidate)
            if found:
                logger.info("US universe from %s: %d tickers", candidate, len(found))
                return found
        except Exception as exc:
            logger.debug("US universe source %s failed: %s", candidate, exc)
    return list(_DEFAULT_US_UNIVERSE)
def _fetch_sp500_tickers() -> list[str]:
    """Scrape the S&P 500 constituent symbols from Wikipedia.

    Walks the tables parsed from the page and uses the first one that has a
    ``Symbol`` column. Symbols are stripped of surrounding whitespace, dots
    are replaced with dashes (``BRK.B`` -> ``BRK-B``), and the result is
    returned sorted.

    Raises:
        RuntimeError: If none of the parsed tables has a ``Symbol`` column.
    """
    for table in pd.read_html(_SP500_WIKI_URL):
        if "Symbol" not in table.columns:
            continue
        symbols = table["Symbol"].dropna()
        symbols = symbols.str.strip()
        # Literal (non-regex) replacement of the share-class separator.
        symbols = symbols.str.replace(".", "-", regex=False)
        tickers = symbols.tolist()
        tickers.sort()
        return tickers
    raise RuntimeError("Could not find Symbol column in S&P 500 Wikipedia table")
def fetch_us_snapshot(
    tickers: list[str] | None = None,
    *,
    universe_source: str = "auto",
    max_workers: int = 8,
) -> pd.DataFrame:
    """Fetch a US equity snapshot in the AlphaSift standard schema.

    Downloads about 30 calendar days of daily bars for all tickers in a single
    ``yf.download`` call, then builds one row per ticker from the last two
    bars that have a close, plus a per-ticker ``fast_info`` lookup for market
    cap and share count. ``pe_ratio``, ``pb_ratio``, ``industry`` and ``name``
    are filled on a best-effort basis by ``_enrich_info_fields``.

    Args:
        tickers: Symbols to fetch. When None, the universe is resolved with
            ``fetch_us_universe(universe_source)``.
        universe_source: Source passed to ``fetch_us_universe``; only used
            when ``tickers`` is None.
        max_workers: Thread-pool size for the per-ticker processing.

    Returns:
        DataFrame with columns code, name, price, change_pct, amount,
        total_mv, circ_mv, pe_ratio, pb_ratio, volume_ratio, turnover_rate,
        industry. Rows follow the order of ``tickers``; tickers without
        usable data are omitted. ``df.attrs["snapshot_source"]`` is set to
        ``"yfinance"``.

    Raises:
        RuntimeError: If no ticker produced a valid row.
    """
    import yfinance as yf

    if tickers is None:
        tickers = fetch_us_universe(universe_source)
    logger.info("Fetching US snapshot for %d tickers", len(tickers))

    hist_end = pd.Timestamp.now().normalize()
    hist_start = hist_end - pd.Timedelta(days=30)
    # NOTE(review): yfinance documents `end` as exclusive, so the bar dated
    # `hist_end` (today, local time) is presumably not included -- confirm
    # this is the intended "latest" bar for the snapshot.
    data = yf.download(
        tickers,
        start=hist_start.strftime("%Y-%m-%d"),
        end=hist_end.strftime("%Y-%m-%d"),
        group_by="ticker",
        auto_adjust=True,
        progress=False,
        threads=True,
    )
    # Nothing downloaded at all: no ticker can yield a row, so fail with the
    # same error the per-ticker path would end in.
    if data is None or data.empty:
        raise RuntimeError("yfinance returned no valid data for any ticker")

    single = len(tickers) == 1
    # Top-level column labels are the tickers when several were requested.
    # Build the lookup set once instead of re-scanning the columns per ticker.
    available = set() if single else set(data.columns.get_level_values(0))

    def _process_ticker(ticker: str) -> dict | None:
        """Build the snapshot row for one ticker, or None if it has no usable data."""
        try:
            if single:
                hist = data.copy()
                # Flatten (Ticker, Price) columns down to the price fields.
                if isinstance(hist.columns, pd.MultiIndex):
                    hist.columns = hist.columns.droplevel("Ticker")
            else:
                if ticker not in available:
                    return None
                hist = data[ticker].copy()
            if hist.empty:
                return None
            hist = hist[hist["Close"].notna()]
            # Two bars are needed: the latest one and the previous close.
            if len(hist) < 2:
                return None

            latest = hist.iloc[-1]
            prev = hist.iloc[-2]
            price = float(latest["Close"])
            prev_close = float(prev["Close"])
            volume = float(latest["Volume"])
            change_pct = ((price - prev_close) / prev_close * 100) if prev_close > 0 else 0.0
            # Latest volume relative to the mean of the last (up to) 20 bars,
            # the latest bar included.
            vol_20d = float(hist["Volume"].tail(20).mean())
            volume_ratio = (volume / vol_20d) if vol_20d > 0 else 1.0

            # Per-ticker lookup; missing values fall back to 0. An exception
            # raised here is caught below and drops the ticker.
            info = yf.Ticker(ticker).fast_info
            market_cap = getattr(info, "market_cap", None) or 0
            shares = getattr(info, "shares", None) or 0
            turnover_rate = (volume / shares * 100) if shares > 0 else 0.0

            return {
                "code": ticker,
                # Placeholder; _enrich_info_fields may replace it.
                "name": ticker,
                "price": price,
                "change_pct": round(change_pct, 2),
                # Traded value approximated as latest volume x latest close.
                "amount": round(volume * price, 0),
                "total_mv": market_cap,
                # The same market-cap figure is used for circ_mv.
                "circ_mv": market_cap,
                "pe_ratio": None,
                "pb_ratio": None,
                "volume_ratio": round(volume_ratio, 2),
                "turnover_rate": round(turnover_rate, 4),
                "industry": "",
            }
        except Exception as e:
            logger.debug("Failed to process %s: %s", ticker, e)
            return None

    # pool.map yields results in input order, so the row order is
    # reproducible instead of depending on thread completion timing.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        rows = [row for row in pool.map(_process_ticker, tickers) if row is not None]

    if not rows:
        raise RuntimeError("yfinance returned no valid data for any ticker")

    df = pd.DataFrame(rows)
    numeric_cols = [
        "price", "change_pct", "amount", "total_mv", "circ_mv",
        "pe_ratio", "pb_ratio", "volume_ratio", "turnover_rate",
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=["price"])
    df = df[df["price"] > 0]
    _enrich_info_fields(df)
    df.attrs["snapshot_source"] = "yfinance"
    logger.info("US snapshot: %d rows from yfinance", len(df))
    return df
def _enrich_info_fields(df: pd.DataFrame) -> None:
"""Best-effort enrichment of pe_ratio, pb_ratio, industry from yfinance info."""
import yfinance as yf
needs_pe = df["pe_ratio"].isna().sum() > len(df) * 0.5
if not needs_pe:
return
for idx in df.index:
ticker = df.at[idx, "code"]
try:
info = yf.Ticker(ticker).info
if pd.isna(df.at[idx, "pe_ratio"]) or df.at[idx, "pe_ratio"] == 0:
df.at[idx, "pe_ratio"] = info.get("trailingPE")
if pd.isna(df.at[idx, "pb_ratio"]) or df.at[idx, "pb_ratio"] == 0:
df.at[idx, "pb_ratio"] = info.get("priceToBook")
if not df.at[idx, "industry"]:
df.at[idx, "industry"] = info.get("industry", "")
if not df.at[idx, "name"] or df.at[idx, "name"] == ticker:
df.at[idx, "name"] = info.get("shortName", ticker)
except Exception:
pass
def fetch_daily_history_yfinance(
    ticker: str,
    *,
    lookback_days: int = 120,
) -> pd.DataFrame:
    """Fetch daily OHLCV history for one US ticker via yfinance.

    Requests ``max(lookback_days * 2, 180)`` calendar days ending at today's
    normalized date and keeps the last ``max(lookback_days, 30)`` rows.

    Returns:
        DataFrame whose price/volume columns are renamed to ``开盘`` (open),
        ``最高`` (high), ``最低`` (low), ``收盘`` (close) and ``成交量``
        (volume), with the date index turned into a ``日期`` column. Any other
        downloaded columns are passed through unchanged.

    Raises:
        RuntimeError: If yfinance returns nothing for ``ticker``.
    """
    import yfinance as yf

    window_end = pd.Timestamp.now().normalize()
    calendar_span = max(lookback_days * 2, 180)
    window_start = window_end - pd.Timedelta(days=calendar_span)

    frame = yf.download(
        ticker,
        start=window_start.strftime("%Y-%m-%d"),
        end=window_end.strftime("%Y-%m-%d"),
        auto_adjust=True,
        progress=False,
    )
    if frame is None or frame.empty:
        raise RuntimeError(f"yfinance daily history empty for {ticker}")

    # Flatten (Price, Ticker) columns down to the price fields.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.droplevel("Ticker")

    keep_rows = max(lookback_days, 30)
    column_map = {
        "Open": "开盘",
        "High": "最高",
        "Low": "最低",
        "Close": "收盘",
        "Volume": "成交量",
    }
    frame = frame.tail(keep_rows).copy().rename(columns=column_map)
    frame.index.name = "日期"
    return frame.reset_index()