fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
246 lines
8.2 KiB
Python
246 lines
8.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""US equity snapshot via yfinance.
|
|
|
|
Pluggable adapter for AlphaSift's L1 pipeline. Fetches a configurable
|
|
equity universe and returns the standard snapshot DataFrame schema.
|
|
|
|
HK is not supported yet: there is no HK universe source or ticker
|
|
configuration path, so ``market="hk"`` is rejected at the pipeline level
|
|
rather than silently screening the US pool.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
import pandas as pd
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Wikipedia page scraped for the S&P 500 constituents ("Symbol" column).
_SP500_WIKI_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# Hardcoded fallback universe: 49 US large-cap tickers, with share classes
# written in dash form (e.g. "BRK-B").
_DEFAULT_US_UNIVERSE = [
    "AAPL", "MSFT", "NVDA", "AMZN", "GOOGL", "META", "TSLA",
    "BRK-B", "AVGO", "JPM", "LLY", "V", "MA", "UNH",
    "XOM", "COST", "HD", "PG", "JNJ", "ABBV", "WMT",
    "NFLX", "BAC", "KO", "CRM", "CVX", "MRK", "PEP",
    "AMD", "TMO", "LIN", "ACN", "CSCO", "MCD", "ABT",
    "ADBE", "WFC", "GE", "DHR", "TXN", "PM", "ISRG",
    "MS", "NEE", "INTU", "DIS", "QCOM", "CAT", "NOW",
]
|
|
|
|
|
|
def fetch_us_universe(source: str = "auto") -> list[str]:
    """Return a list of US equity tickers.

    Sources:
        sp500   — scrape S&P 500 from Wikipedia
        env     — read ALPHASIFT_US_TICKERS (comma-separated)
        default — hardcoded list of US large-caps
        auto    — try sp500 → env → default

    Args:
        source: Name of the universe source. Matched case-insensitively and
            with surrounding whitespace ignored.

    Returns:
        A new list of ticker symbols. For ``env`` the symbols are stripped,
        blanks are dropped and repeats are removed (first occurrence wins).

    Raises:
        ValueError: If ``source`` is not a known name, or if ``env`` is
            requested and ALPHASIFT_US_TICKERS is unset, blank, or contains
            only separators.
        Exception: Whatever the S&P 500 scrape raises when ``sp500`` is
            requested explicitly. In ``auto`` mode such failures are logged
            at debug level and the next source is tried.
    """
    src = source.strip().lower()

    if src == "auto":
        for candidate in ("sp500", "env", "default"):
            try:
                tickers = fetch_us_universe(candidate)
            except Exception as exc:
                logger.debug("US universe source %s failed: %s", candidate, exc)
                continue
            if tickers:
                logger.info(
                    "US universe from %s: %d tickers", candidate, len(tickers)
                )
                return tickers
        # Defensive fallback; "default" above should already have returned.
        return list(_DEFAULT_US_UNIVERSE)

    if src == "sp500":
        return _fetch_sp500_tickers()

    if src == "env":
        raw = os.getenv("ALPHASIFT_US_TICKERS", "").strip()
        if not raw:
            raise ValueError("ALPHASIFT_US_TICKERS not set")
        symbols = (piece.strip() for piece in raw.split(","))
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # repeated symbol cannot produce two rows further down the pipeline.
        tickers = list(dict.fromkeys(sym for sym in symbols if sym))
        if not tickers:
            # e.g. " , ," — treat like the unset case instead of returning [].
            raise ValueError("ALPHASIFT_US_TICKERS contains no tickers")
        return tickers

    if src == "default":
        # Return a copy so callers cannot mutate the module-level list.
        return list(_DEFAULT_US_UNIVERSE)

    raise ValueError(f"Unknown US universe source: {source}")
|
|
|
|
|
|
def _fetch_sp500_tickers() -> list[str]:
    """Scrape the S&P 500 constituent symbols from Wikipedia.

    Downloads the page at ``_SP500_WIKI_URL``, parses its HTML tables and
    uses the first table that has a ``Symbol`` column.

    Returns:
        The symbols sorted alphabetically, stripped of whitespace and with
        ``.`` replaced by ``-``.

    Raises:
        RuntimeError: If no parsed table has a ``Symbol`` column.
        Exception: Network or parsing errors from urllib / pandas propagate
            to the caller.
    """
    import io
    import urllib.request

    # Fetch the page ourselves rather than passing the URL to pandas:
    # Wikimedia's User-Agent policy asks clients to identify themselves, and
    # requests carrying a generic library User-Agent may be rejected.
    request = urllib.request.Request(
        _SP500_WIKI_URL,
        headers={"User-Agent": "Mozilla/5.0 (compatible; AlphaSift/1.0)"},
    )
    # The timeout keeps a stalled connection from hanging the caller.
    with urllib.request.urlopen(request, timeout=30) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        html = response.read().decode(charset, errors="replace")

    for table in pd.read_html(io.StringIO(html)):
        if "Symbol" in table.columns:
            symbols = (
                table["Symbol"]
                .dropna()
                .str.strip()
                # Rewrite share-class symbols from dot to dash form.
                .str.replace(".", "-", regex=False)
            )
            return sorted(symbols.tolist())
    raise RuntimeError("Could not find Symbol column in S&P 500 Wikipedia table")
|
|
|
|
|
|
def fetch_us_snapshot(
    tickers: list[str] | None = None,
    *,
    universe_source: str = "auto",
    max_workers: int = 8,
) -> pd.DataFrame:
    """Fetch US equity snapshot in AlphaSift standard schema.

    Downloads about 30 calendar days of daily bars for every ticker in one
    ``yf.download`` call, then builds one row per ticker from the two most
    recent bars that have a close. Market cap and share count come from a
    per-ticker ``fast_info`` lookup run on a thread pool.

    Returned columns: code, name, price, change_pct, amount, total_mv,
    circ_mv, pe_ratio, pb_ratio, volume_ratio, turnover_rate, industry.
    ``circ_mv`` is filled with the same market-cap value as ``total_mv``.

    Args:
        tickers: Symbols to fetch. When None, the universe is resolved via
            ``fetch_us_universe(universe_source)``. Repeated symbols are
            collapsed so each appears at most once.
        universe_source: Universe source name, used only when ``tickers`` is
            None.
        max_workers: Thread-pool size for the per-ticker processing.

    Returns:
        A DataFrame with one row per usable ticker, in the order the tickers
        were supplied. ``df.attrs["snapshot_source"]`` is set to
        ``"yfinance"``.

    Raises:
        RuntimeError: If no tickers are supplied, or yfinance yields no
            usable data for any ticker.
    """
    import yfinance as yf

    if tickers is None:
        tickers = fetch_us_universe(universe_source)
    if isinstance(tickers, str):
        # A bare string would otherwise be iterated character by character.
        tickers = [tickers]
    # De-duplicate (first occurrence wins) so a symbol never yields two rows.
    tickers = list(dict.fromkeys(tickers))
    if not tickers:
        raise RuntimeError("No tickers supplied for US snapshot")

    logger.info("Fetching US snapshot for %d tickers", len(tickers))

    # NOTE(review): yfinance documents `end` as exclusive, so today's bar is
    # presumably not part of the window — confirm that this is intended.
    hist_end = pd.Timestamp.now().normalize()
    hist_start = hist_end - pd.Timedelta(days=30)
    data = yf.download(
        tickers,
        start=hist_start.strftime("%Y-%m-%d"),
        end=hist_end.strftime("%Y-%m-%d"),
        group_by="ticker",
        auto_adjust=True,
        progress=False,
        threads=True,
    )
    if data is None or data.empty:
        # Nothing to process; skip the per-ticker network lookups entirely.
        raise RuntimeError("yfinance returned no valid data for any ticker")

    single_ticker = len(tickers) == 1
    # Hoisted out of the workers: top-level column labels of the download.
    available = set(data.columns.get_level_values(0))

    def _process_ticker(ticker: str) -> dict | None:
        """Build the snapshot row for one ticker, or None if it is unusable."""
        try:
            if single_ticker:
                hist = data.copy()
                if isinstance(hist.columns, pd.MultiIndex):
                    hist.columns = hist.columns.droplevel("Ticker")
            else:
                if ticker not in available:
                    return None
                hist = data[ticker].copy()
            if hist.empty:
                return None

            hist = hist[hist["Close"].notna()]
            if len(hist) < 2:
                # Need both a latest and a previous close.
                return None

            latest = hist.iloc[-1]
            prev = hist.iloc[-2]
            price = float(latest["Close"])
            prev_close = float(prev["Close"])
            volume = float(latest["Volume"])
            change_pct = (
                ((price - prev_close) / prev_close * 100) if prev_close > 0 else 0.0
            )

            # Latest volume relative to the mean of the last (up to) 20 bars.
            vol_20d = float(hist["Volume"].tail(20).mean())
            volume_ratio = (volume / vol_20d) if vol_20d > 0 else 1.0

            # Market cap / share count are optional extras: a failed lookup
            # falls back to 0 instead of discarding the valid price row.
            market_cap = 0
            shares = 0
            try:
                info = yf.Ticker(ticker).fast_info
                market_cap = getattr(info, "market_cap", None) or 0
                shares = getattr(info, "shares", None) or 0
            except Exception as info_err:
                logger.debug("fast_info lookup failed for %s: %s", ticker, info_err)
            turnover_rate = (volume / shares * 100) if shares > 0 else 0.0

            return {
                "code": ticker,
                "name": ticker,
                "price": price,
                "change_pct": round(change_pct, 2),
                "amount": round(volume * price, 0),
                "total_mv": market_cap,
                "circ_mv": market_cap,
                "pe_ratio": None,
                "pb_ratio": None,
                "volume_ratio": round(volume_ratio, 2),
                "turnover_rate": round(turnover_rate, 4),
                "industry": "",
            }
        except Exception as e:
            logger.debug("Failed to process %s: %s", ticker, e)
            return None

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # pool.map yields results in input order, so the row order is
        # deterministic rather than depending on thread completion order.
        rows = [row for row in pool.map(_process_ticker, tickers) if row]

    if not rows:
        raise RuntimeError("yfinance returned no valid data for any ticker")

    df = pd.DataFrame(rows)

    numeric_cols = [
        "price", "change_pct", "amount", "total_mv", "circ_mv",
        "pe_ratio", "pb_ratio", "volume_ratio", "turnover_rate",
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.dropna(subset=["price"])
    # Explicit copy: the frame is mutated in place by the enrichment below.
    df = df[df["price"] > 0].copy()

    _enrich_info_fields(df)

    df.attrs["snapshot_source"] = "yfinance"
    logger.info("US snapshot: %d rows from yfinance", len(df))
    return df
|
|
|
|
|
|
def _enrich_info_fields(df: pd.DataFrame) -> None:
    """Best-effort enrichment of pe_ratio, pb_ratio, industry and name.

    Mutates ``df`` in place using ``yfinance.Ticker(code).info``. Nothing is
    done unless more than half of the ``pe_ratio`` values are missing. For
    each row only fields that are empty are filled: NaN/0 for the ratios, a
    falsy ``industry``, and a ``name`` that is falsy or equal to the ticker.

    Failures are logged at debug level and never raised; a failure on one
    field does not prevent the other fields of that row from being filled.

    Args:
        df: Snapshot frame with ``code``, ``name``, ``pe_ratio``,
            ``pb_ratio`` and ``industry`` columns.
    """
    needs_pe = df["pe_ratio"].isna().sum() > len(df) * 0.5
    if not needs_pe:
        return

    # Imported only once enrichment is actually required.
    import yfinance as yf

    def _as_float(value) -> float:
        """Coerce an info value to float; None / non-numeric become NaN."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return float("nan")

    def _fill(idx, column: str, ticker: str, info) -> None:
        """Fill one empty field of one row, logging instead of raising."""
        try:
            current = df.at[idx, column]
            if column in ("pe_ratio", "pb_ratio"):
                if pd.isna(current) or current == 0:
                    key = "trailingPE" if column == "pe_ratio" else "priceToBook"
                    df.at[idx, column] = _as_float(info.get(key))
            elif column == "industry":
                if not current:
                    # `or ""` also covers a key that is present but None.
                    df.at[idx, column] = info.get("industry") or ""
            elif column == "name":
                if not current or current == ticker:
                    df.at[idx, column] = info.get("shortName") or ticker
        except Exception as exc:
            logger.debug("Could not enrich %s for %s: %s", column, ticker, exc)

    for idx in df.index:
        ticker = df.at[idx, "code"]
        try:
            info = yf.Ticker(ticker).info or {}
        except Exception as exc:
            logger.debug("yfinance info lookup failed for %s: %s", ticker, exc)
            continue
        for column in ("pe_ratio", "pb_ratio", "industry", "name"):
            _fill(idx, column, ticker, info)
|
|
|
|
|
|
def fetch_daily_history_yfinance(
    ticker: str,
    *,
    lookback_days: int = 120,
) -> pd.DataFrame:
    """Fetch daily OHLCV history for a US ticker via yfinance.

    Requests a calendar window of ``max(lookback_days * 2, 180)`` days ending
    at today's midnight, then keeps the last ``max(lookback_days, 30)`` rows.

    The result has the date index exposed as a ``日期`` column and the
    Open/High/Low/Close/Volume columns renamed to ``开盘``/``最高``/``最低``/
    ``收盘``/``成交量`` (date, open, high, low, close, volume). Presumably
    this is the schema alphasift.daily's enrichment logic expects — confirm
    against that module.

    Args:
        ticker: Symbol to download.
        lookback_days: Number of most recent daily rows wanted.

    Returns:
        The trimmed, renamed history with a fresh integer index.

    Raises:
        RuntimeError: If yfinance returns None or an empty frame.
    """
    import yfinance as yf

    date_fmt = "%Y-%m-%d"
    window_end = pd.Timestamp.now().normalize()
    # Over-request calendar days so enough trading days remain after trimming.
    calendar_span = max(lookback_days * 2, 180)
    window_start = window_end - pd.Timedelta(days=calendar_span)

    frame = yf.download(
        ticker,
        start=window_start.strftime(date_fmt),
        end=window_end.strftime(date_fmt),
        auto_adjust=True,
        progress=False,
    )
    if frame is None or frame.empty:
        raise RuntimeError(f"yfinance daily history empty for {ticker}")

    # Flatten (field, ticker) style columns down to the field names.
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = frame.columns.droplevel("Ticker")

    keep_rows = max(lookback_days, 30)
    column_names = {
        "Open": "开盘",
        "High": "最高",
        "Low": "最低",
        "Close": "收盘",
        "Volume": "成交量",
    }
    trimmed = frame.tail(keep_rows).copy()
    trimmed = trimmed.rename(columns=column_names)
    trimmed.index.name = "日期"
    return trimmed.reset_index()
|