Files
MoFin/venv/lib/python3.12/site-packages/yfinance/scrapers/fundamentals.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

205 lines
8.2 KiB
Python

import datetime
import json
import warnings
import pandas as pd
from yfinance import utils, const
from yfinance.config import YfConfig
from yfinance.data import YfData
from yfinance.exceptions import YFException, YFNotImplementedError
class Fundamentals:
def __init__(self, data: YfData, symbol: str):
self._data = data
self._symbol = symbol
self._earnings = None
self._financials = None
self._shares = None
self._financials_data = None
self._fin_data_quote = None
self._basics_already_scraped = False
self._financials = Financials(data, symbol)
@property
def financials(self) -> "Financials":
return self._financials
@property
def earnings(self) -> dict:
warnings.warn("'Ticker.earnings' is deprecated as not available via API. Look for \"Net Income\" in Ticker.income_stmt.", DeprecationWarning)
return None
@property
def shares(self) -> pd.DataFrame:
if self._shares is None:
raise YFNotImplementedError('shares')
return self._shares
class Financials:
def __init__(self, data: YfData, symbol: str):
self._data = data
self._symbol = symbol
self._income_time_series = {}
self._balance_sheet_time_series = {}
self._cash_flow_time_series = {}
def get_income_time_series(self, freq="yearly") -> pd.DataFrame:
res = self._income_time_series
if freq not in res:
res[freq] = self._fetch_time_series("income", freq)
return res[freq]
def get_balance_sheet_time_series(self, freq="yearly") -> pd.DataFrame:
res = self._balance_sheet_time_series
if freq not in res:
res[freq] = self._fetch_time_series("balance-sheet", freq)
return res[freq]
def get_cash_flow_time_series(self, freq="yearly") -> pd.DataFrame:
res = self._cash_flow_time_series
if freq not in res:
res[freq] = self._fetch_time_series("cash-flow", freq)
return res[freq]
@utils.log_indent_decorator
def _fetch_time_series(self, name, timescale):
# Fetching time series preferred over scraping 'QuoteSummaryStore',
# because it matches what Yahoo shows. But for some tickers returns nothing,
# despite 'QuoteSummaryStore' containing valid data.
allowed_names = ["income", "balance-sheet", "cash-flow"]
allowed_timescales = ["yearly", "quarterly", "trailing"]
if name not in allowed_names:
raise ValueError(f"Illegal argument: name must be one of: {allowed_names}")
if timescale not in allowed_timescales:
raise ValueError(f"Illegal argument: timescale must be one of: {allowed_timescales}")
if timescale == "trailing" and name not in ('income', 'cash-flow'):
raise ValueError("Illegal argument: frequency 'trailing'" +
" only available for cash-flow or income data.")
try:
statement = self._create_financials_table(name, timescale)
if statement is not None:
return statement
except YFException as e:
if not YfConfig.debug.hide_exceptions:
raise
utils.get_yf_logger().error(f"{self._symbol}: Failed to create {name} financials table for reason: {e}")
return pd.DataFrame()
def _create_financials_table(self, name, timescale):
if name == "income":
# Yahoo stores the 'income' table internally under 'financials' key
name = "financials"
keys = const.fundamentals_keys[name]
try:
return self._get_financials_time_series(timescale, keys)
except Exception:
if not YfConfig.debug.hide_exceptions:
raise
pass
# Tuned so the resulting URL stays under ~2KB even for the longest "annual"
# prefix, which keeps it below the practical limits of typical NAT / proxy
# paths (notably WSL2, which silently drops the long single-shot URL).
_CHUNK_KEYS = 60
def _get_financials_time_series(self, timescale, keys: list) -> pd.DataFrame:
timescale_translation = {"yearly": "annual", "quarterly": "quarterly", "trailing": "trailing"}
timescale = timescale_translation[timescale]
# Yahoo returns maximum 4 years or 5 quarters, regardless of start_dt:
start_dt = datetime.datetime(2016, 12, 31)
end = pd.Timestamp.now('UTC').ceil("D")
period_qs = f"&period1={int(start_dt.timestamp())}&period2={int(end.timestamp())}"
ts_url_base = f"https://query2.finance.yahoo.com/ws/fundamentals-timeseries/v1/finance/timeseries/{self._symbol}?symbol={self._symbol}"
full_url = ts_url_base + "&type=" + ",".join([timescale + k for k in keys]) + period_qs
# Fast path: single long URL. Falls back to chunked requests if it
# fails (silent drop on WSL2 NAT / restrictive proxies). Sticky so a
# loop over tickers doesn't eat one timeout per ticker. If the chunked
# fallback also fails, URL length isn't the problem — revert the flag
# and re-raise so the next call retries the fast path.
if self._data.fundamentals_use_chunked:
data_raw = self._fetch_fundamentals_chunked(ts_url_base, timescale, keys, period_qs)
else:
try:
data_raw = self._fetch_fundamentals_payload(full_url)
except Exception as e:
utils.get_yf_logger().debug(
f"{self._symbol}: single-URL fundamentals fetch failed ({type(e).__name__}); "
f"falling back to chunked requests for this and subsequent fetches"
)
self._data.fundamentals_use_chunked = True
try:
data_raw = self._fetch_fundamentals_chunked(ts_url_base, timescale, keys, period_qs)
except Exception:
self._data.fundamentals_use_chunked = False
raise
for d in data_raw:
d.pop("meta", None)
# Now reshape data into a table:
# Step 1: get columns and index:
timestamps = set()
data_unpacked = {}
for x in data_raw:
for k in x.keys():
if k == "timestamp":
timestamps.update(x[k])
else:
data_unpacked[k] = x[k]
timestamps = sorted(list(timestamps))
dates = pd.to_datetime(timestamps, unit="s")
df = pd.DataFrame(columns=dates, index=list(data_unpacked.keys()))
for k, v in data_unpacked.items():
if df is None:
df = pd.DataFrame(columns=dates, index=[k])
df.loc[k] = {pd.Timestamp(x["asOfDate"]): x["reportedValue"]["raw"] for x in v}
df.index = df.index.str.replace("^" + timescale, "", regex=True)
# Ensure float type, not object
for d in df.columns:
df[d] = df[d].astype('float')
# Reorder table to match order on Yahoo website
df = df.reindex([k for k in keys if k in df.index])
df = df[sorted(df.columns, reverse=True)]
# Trailing 12 months return only the first column.
if (timescale == "trailing"):
df = df.iloc[:, [0]]
return df
def _fetch_fundamentals_chunked(self, ts_url_base: str, timescale: str, keys: list, period_qs: str) -> list:
data_raw: list = []
for i in range(0, len(keys), self._CHUNK_KEYS):
chunk = keys[i:i + self._CHUNK_KEYS]
chunk_url = ts_url_base + "&type=" + ",".join([timescale + k for k in chunk]) + period_qs
data_raw.extend(self._fetch_fundamentals_payload(chunk_url))
return data_raw
def _fetch_fundamentals_payload(self, url: str) -> list:
"""Fetch a fundamentals-timeseries URL and return the parsed `result`
list. Raises if Yahoo returns an empty / error payload (callers can
catch and fall back to chunked requests)."""
json_str = self._data.cache_get(url=url).text
json_data = json.loads(json_str)
result = (json_data.get("timeseries") or {}).get("result")
if not result:
raise YFException("Empty fundamentals-timeseries result")
return result