Files
MoFin/venv/lib/python3.12/site-packages/alphasift/daily.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

1172 lines
42 KiB
Python

# -*- coding: utf-8 -*-
"""Lightweight daily K-line enrichment for narrowed candidate pools."""
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
import hashlib
import json
import os
from pathlib import Path
import threading
import time
import pandas as pd
import requests
# Placeholder value for every daily-feature column. enrich_daily_features copies
# this mapping as the feature set of a candidate whose history fetch failed.
_DAILY_FEATURE_DEFAULTS = {
"daily_data_points": pd.NA,
"change_60d": pd.NA,
"ma5": pd.NA,
"ma20": pd.NA,
"ma60": pd.NA,
"ma_bullish": pd.NA,
"price_above_ma20": pd.NA,
"macd_status": "",
"rsi_status": "",
"rsi14": pd.NA,
"signal_score": pd.NA,
"prev_high_20d": pd.NA,
"range_20d_pct": pd.NA,
"breakout_20d_pct": pd.NA,
"volume_ratio_20d": pd.NA,
"body_pct": pd.NA,
"pullback_to_ma20_pct": pd.NA,
"consolidation_days_20d": pd.NA,
"volatility_20d_pct": pd.NA,
"max_drawdown_20d_pct": pd.NA,
"atr_20_pct": pd.NA,
"daily_quality_score": pd.NA,
"daily_quality_flags": "",
"daily_source": "",
}
# Worker count used by enrich_daily_features when ``max_workers`` is None.
_DAILY_ENRICH_MAX_WORKERS = 1
# Cache payloads whose "version" field differs from this value are ignored on read.
_DAILY_HISTORY_CACHE_VERSION = 1
# Default cache freshness window in seconds (24 hours).
_DAILY_HISTORY_CACHE_TTL_SECONDS = 24 * 60 * 60
# Number of consecutive failures after which a source enters cooldown.
_SOURCE_HEALTH_FAILURE_THRESHOLD = 3
# Length of that cooldown in seconds (5 minutes).
_SOURCE_HEALTH_COOLDOWN_SECONDS = 5 * 60
# Tushare HTTP endpoint used when no URL environment variable is set.
_DEFAULT_TUSHARE_HTTP_URL = "http://api.waditu.com"
# Held around the Baostock login/query/logout sequence.
_BAOSTOCK_LOCK = threading.Lock()
# Set to an error message once Baostock reports a network outage; while set,
# _fetch_daily_baostock raises immediately instead of contacting the service.
_BAOSTOCK_OUTAGE_ERROR: str | None = None
# In-process per-source health counters; access is guarded by _SOURCE_HEALTH_LOCK.
_SOURCE_HEALTH: dict[str, dict[str, float]] = {}
_SOURCE_HEALTH_LOCK = threading.Lock()
def enrich_daily_features(
df: pd.DataFrame,
*,
max_rows: int = 100,
lookback_days: int = 120,
source: str = "akshare",
fetch_retries: int = 2,
cache_dir: str | Path | None = None,
cache_ttl_seconds: float | None = None,
max_workers: int | None = None,
) -> pd.DataFrame:
"""Attach daily technical features to the first ``max_rows`` candidates.
This intentionally runs after broad snapshot filtering; it is not a full
market historical-data pass.
"""
if df.empty or max_rows <= 0:
return df.copy()
result = df.copy()
daily_errors: list[str] = []
daily_source_counts: dict[str, int] = {}
daily_quality_flag_counts: dict[str, int] = {}
daily_source_order_notes: list[str] = []
daily_source_health: dict[str, object] = {}
success_count = 0
selected_index = list(result.index[:max_rows])
fetch_requests: list[tuple[object, str]] = []
for idx in selected_index:
raw_code = str(result.at[idx, "code"] if "code" in result.columns else "").strip()
if not raw_code:
continue
code = raw_code.zfill(6) if raw_code.isdigit() else raw_code
fetch_requests.append((idx, code))
def fetch_one(request: tuple[object, str]) -> tuple[object, dict[str, object], str | None, dict[str, object]]:
idx, code = request
try:
hist = fetch_daily_history(
code,
lookback_days=lookback_days,
source=source,
retries=fetch_retries,
cache_dir=cache_dir,
cache_ttl_seconds=cache_ttl_seconds,
)
features = compute_daily_features(hist)
features["daily_source"] = str(hist.attrs.get("daily_source", ""))
metadata = {
"daily_source": features["daily_source"],
"daily_quality_flags": features.get("daily_quality_flags", ""),
"daily_source_order_notes": list(hist.attrs.get("daily_source_order_notes", []) or []),
"daily_source_health": dict(hist.attrs.get("daily_source_health", {}) or {}),
}
return idx, features, None, metadata
except Exception as exc:
features = dict(_DAILY_FEATURE_DEFAULTS)
features["daily_quality_score"] = 0.0
features["daily_quality_flags"] = "fetch_failed"
return idx, features, f"{code}: {exc}", {"daily_quality_flags": "fetch_failed"}
if len(fetch_requests) <= 1:
fetched_rows = [fetch_one(request) for request in fetch_requests]
else:
worker_limit = min(_normalize_max_workers(max_workers), len(fetch_requests))
with ThreadPoolExecutor(max_workers=worker_limit) as executor:
fetched_rows = list(executor.map(fetch_one, fetch_requests))
for idx, features, error, metadata in fetched_rows:
for flag in str(metadata.get("daily_quality_flags") or "").split(";"):
if flag:
daily_quality_flag_counts[flag] = daily_quality_flag_counts.get(flag, 0) + 1
if error:
daily_errors.append(error)
else:
success_count += 1
source_name = str(metadata.get("daily_source") or "unknown")
daily_source_counts[source_name] = daily_source_counts.get(source_name, 0) + 1
order_notes = metadata.get("daily_source_order_notes", [])
if not isinstance(order_notes, list):
order_notes = []
for note in order_notes:
note_text = str(note)
if note_text and note_text not in daily_source_order_notes:
daily_source_order_notes.append(note_text)
source_health = metadata.get("daily_source_health")
if isinstance(source_health, dict):
daily_source_health.update(source_health)
for key, value in features.items():
result.at[idx, key] = value
result.attrs["daily_errors"] = daily_errors
result.attrs["daily_success_count"] = success_count
result.attrs["daily_source_counts"] = daily_source_counts
result.attrs["daily_quality_flag_counts"] = daily_quality_flag_counts
result.attrs["daily_source_order_notes"] = daily_source_order_notes
result.attrs["daily_source_health"] = daily_source_health
return result
def fetch_daily_history(
    code: str,
    *,
    lookback_days: int = 120,
    source: str = "akshare",
    retries: int = 2,
    cache_dir: str | Path | None = None,
    cache_ttl_seconds: float | None = None,
) -> pd.DataFrame:
    """Fetch the daily K-line history of a single stock code.

    ``source`` may be ``tencent``, ``sina``, ``akshare``, ``baostock``,
    ``tushare``, ``yfinance`` or ``auto``. ``auto`` tries Tushare first when a
    token is configured, then Tencent, Sina, akshare and Baostock, with the
    order adjusted by the in-process source health. ``yfinance`` is only used
    when requested explicitly and receives the code and look-back exactly as
    passed in (a US ticker is expected there, not an A-share code).

    When ``cache_dir`` is given, a fresh cached frame is returned without any
    network call, successful fetches are written back, and a stale cached
    frame (marked ``attrs["daily_stale"] = True``) is returned if every source
    fails.

    Raises:
        ValueError: ``source`` is not one of the supported names.
        RuntimeError: every source failed and no stale cache was available.
    """
    normalized_code = _normalize_daily_code(code)
    normalized_lookback_days = int(lookback_days)
    src = _normalize_daily_source(source)

    source_order_notes: list[str] = []
    if src == "auto":
        if _has_tushare_token():
            candidates: tuple[str, ...] = ("tushare", "tencent", "sina", "akshare", "baostock")
        else:
            candidates = ("tencent", "sina", "akshare", "baostock")
        sources, source_order_notes = _rank_daily_sources_by_health(candidates)
    elif src in ("akshare", "baostock", "tushare", "tencent", "sina", "yfinance"):
        sources = (src,)
    else:
        raise ValueError(f"Unsupported daily source: {source}")

    cache_path = None
    if cache_dir is not None:
        # The cache key uses the *requested* source, so "auto" has its own entry.
        cache_path = _daily_history_cache_path(
            cache_dir,
            code=normalized_code,
            source=src,
            lookback_days=normalized_lookback_days,
        )
        cached = _read_daily_history_cache(cache_path, ttl_seconds=cache_ttl_seconds)
        if cached is not None:
            return cached

    def fetch_from(name: str) -> pd.DataFrame:
        """Call the fetcher that belongs to one concrete source name."""
        if name == "yfinance":
            from alphasift.snapshot_us import fetch_daily_history_yfinance
            return fetch_daily_history_yfinance(code, lookback_days=lookback_days)
        if name == "tencent":
            return _fetch_daily_tencent(normalized_code, lookback_days=normalized_lookback_days)
        if name == "sina":
            return _fetch_daily_sina(normalized_code, lookback_days=normalized_lookback_days)
        if name == "akshare":
            return _fetch_daily_akshare(normalized_code, lookback_days=normalized_lookback_days)
        if name == "tushare":
            return _fetch_daily_tushare(normalized_code, lookback_days=normalized_lookback_days)
        return _fetch_daily_baostock(normalized_code, lookback_days=normalized_lookback_days)

    attempts = max(int(retries), 0) + 1
    errors: list[str] = []
    for current in sources:
        disabled_reason = _source_disabled_reason(current)
        if disabled_reason:
            errors.append(f"{current}: {disabled_reason}")
            continue

        last_error: Exception | None = None
        for attempt in range(attempts):
            try:
                result = fetch_from(current)
                _record_source_success(current, rows=len(result))
                result.attrs.update({
                    "daily_source": current,
                    "daily_requested_source": src,
                    "daily_source_order": list(sources),
                    "daily_source_order_notes": list(source_order_notes),
                    "source_errors": list(errors),
                    "daily_source_health": _daily_source_health_snapshot(sources),
                })
                if cache_path is not None:
                    _write_daily_history_cache(
                        cache_path,
                        result,
                        code=normalized_code,
                        source=src,
                        lookback_days=normalized_lookback_days,
                    )
                return result
            except Exception as exc:  # noqa: BLE001 - aggregated below
                last_error = exc
                # Back off before the next attempt; no sleep after the last one.
                if attempt < attempts - 1:
                    time.sleep(min(0.5 * (attempt + 1), 2.0))
        errors.append(f"{current} after {attempts} attempts: {last_error}")
        _record_source_failure(current)

    if cache_path is not None:
        # Every source failed: fall back to an expired cache entry if one exists.
        stale = _read_daily_history_cache(
            cache_path,
            ttl_seconds=cache_ttl_seconds,
            allow_stale=True,
        )
        if stale is not None:
            stale.attrs.update({
                "daily_stale": True,
                "daily_source_order": list(sources),
                "daily_source_order_notes": list(source_order_notes),
                "source_errors": list(errors),
                "daily_source_health": _daily_source_health_snapshot(sources),
            })
            return stale

    raise RuntimeError(
        f"daily history fetch failed for {normalized_code}: {'; '.join(errors)}"
    )
def _normalize_daily_code(value: object) -> str:
    """Reduce a raw code value to a six-digit string where possible.

    ``None``, blank text and the literals ``nan``/``none``/``<na>`` (any case)
    give ``""``. A trailing ``.0`` on an all-digit value is dropped. The digits
    found in the text are left-padded with zeros and cut to the last six; text
    without any digit is returned stripped but otherwise unchanged.
    """
    if value is None:
        return ""
    text = str(value).strip()
    if not text or text.lower() in {"nan", "none", "<na>"}:
        return ""
    if text.endswith(".0") and text[:-2].isdigit():
        text = text[:-2]
    digits = "".join(filter(str.isdigit, text))
    if not digits:
        return text
    return digits.zfill(6)[-6:]
def _normalize_daily_source(source: str | None) -> str:
return (source or "akshare").strip().lower()
def _normalize_max_workers(value: int | None) -> int:
if value is None:
return _DAILY_ENRICH_MAX_WORKERS
return max(1, int(value))
def _rank_daily_sources_by_health(sources: tuple[str, ...]) -> tuple[tuple[str, ...], list[str]]:
    """Order sources so that unhealthy ones are tried later.

    Sources are sorted by (currently in cooldown, consecutive failure count,
    original position). Returns the ordered tuple and a list of notes: empty
    when the order is unchanged, otherwise a single note spelling out the
    adjusted order.
    """
    now = time.monotonic()
    with _SOURCE_HEALTH_LOCK:
        # Copy the state under the lock; ranking happens on the copies.
        health = {source: dict(_SOURCE_HEALTH.get(source, {})) for source in sources}

    sort_keys: dict[str, tuple[int, float, int]] = {}
    for position, name in enumerate(sources):
        state = health.get(name, {})
        in_cooldown = float(state.get("disabled_until", 0.0)) > now
        failures = float(state.get("failures", 0.0))
        sort_keys[name] = (1 if in_cooldown else 0, failures, position)

    ranked = tuple(sorted(sources, key=sort_keys.__getitem__))
    if ranked == sources:
        return sources, []
    return ranked, [f"daily source order adjusted by health: {','.join(ranked)}"]
def _source_disabled_reason(source: str) -> str | None:
    """Explain why ``source`` is in cooldown, or return ``None`` if it is usable.

    An expired cooldown timestamp is reset to ``0.0`` as a side effect; the
    failure counter is left as it is.
    """
    now = time.monotonic()
    with _SOURCE_HEALTH_LOCK:
        state = _SOURCE_HEALTH.get(source)
        if not state:
            return None
        disabled_until = float(state.get("disabled_until", 0.0))
        if disabled_until > now:
            return f"temporarily disabled for {disabled_until - now:.1f}s after repeated failures"
        if disabled_until:
            # Cooldown elapsed: clear the timestamp so the source is tried again.
            state["disabled_until"] = 0.0
        return None
def _record_source_success(source: str, *, rows: int | None = None) -> None:
    """Record a successful fetch for ``source``.

    Increments the success counter, clears the consecutive-failure counter and
    any cooldown, and stamps the wall-clock time. When ``rows`` is given, the
    last row count and a running average of row counts are updated as well.
    """
    with _SOURCE_HEALTH_LOCK:
        state = _SOURCE_HEALTH.setdefault(source, {"failures": 0.0, "disabled_until": 0.0})
        successes = float(state.get("successes", 0.0)) + 1.0
        state.update(
            successes=successes,
            failures=0.0,
            disabled_until=0.0,
            last_success_at=time.time(),
        )
        if rows is None:
            return
        row_count = float(rows)
        state["last_rows"] = row_count
        # Incremental mean; the first observation seeds the average with itself.
        previous_avg = float(state.get("avg_rows", rows))
        state["avg_rows"] = previous_avg + (row_count - previous_avg) / successes
def _record_source_failure(source: str) -> None:
    """Record a failed fetch for ``source`` and start a cooldown if warranted.

    Both the consecutive and the lifetime failure counters are incremented.
    Once the consecutive count reaches ``_SOURCE_HEALTH_FAILURE_THRESHOLD`` the
    source is disabled for ``_SOURCE_HEALTH_COOLDOWN_SECONDS``.
    """
    now = time.monotonic()
    with _SOURCE_HEALTH_LOCK:
        state = _SOURCE_HEALTH.setdefault(source, {"failures": 0.0, "disabled_until": 0.0})
        failures = float(state.get("failures", 0.0)) + 1.0
        state.update(
            failures=failures,
            total_failures=float(state.get("total_failures", 0.0)) + 1.0,
            last_failure_at=time.time(),
        )
        if failures >= _SOURCE_HEALTH_FAILURE_THRESHOLD:
            state["disabled_until"] = now + _SOURCE_HEALTH_COOLDOWN_SECONDS
def daily_source_health_snapshot() -> dict[str, dict[str, float | bool]]:
    """Return a copy of the in-process health statistics of every tracked source."""
    tracked_sources = tuple(_SOURCE_HEALTH)
    return _daily_source_health_snapshot(tracked_sources)
def _daily_source_health_snapshot(sources: tuple[str, ...]) -> dict[str, dict[str, float | bool]]:
    """Build a health summary for the given sources.

    Each entry holds the float counters ``successes``, ``failures``,
    ``total_failures``, ``last_rows`` and ``avg_rows`` (``0.0`` when unknown)
    plus a boolean ``disabled`` that is true while the cooldown is running.
    Sources without recorded state are included with all-zero values.
    """
    now = time.monotonic()
    counters = ("successes", "failures", "total_failures", "last_rows", "avg_rows")
    snapshot: dict[str, dict[str, float | bool]] = {}
    with _SOURCE_HEALTH_LOCK:
        for source in sources:
            state = _SOURCE_HEALTH.get(source, {})
            entry: dict[str, float | bool] = {
                name: float(state.get(name, 0.0)) for name in counters
            }
            entry["disabled"] = float(state.get("disabled_until", 0.0)) > now
            snapshot[source] = entry
    return snapshot
def _daily_history_cache_path(
cache_dir: str | Path,
*,
code: str,
source: str,
lookback_days: int,
) -> Path:
key = f"{code}|{source}|{int(lookback_days)}"
digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
safe_source = "".join(ch if ch.isalnum() else "-" for ch in source).strip("-") or "source"
safe_code = "".join(ch if ch.isalnum() else "-" for ch in code).strip("-") or "code"
return Path(cache_dir) / f"{safe_code}_{safe_source}_{int(lookback_days)}_{digest}.json"
def _read_daily_history_cache(
path: Path,
*,
ttl_seconds: float | None,
allow_stale: bool = False,
) -> pd.DataFrame | None:
try:
stat = path.stat()
except FileNotFoundError:
return None
ttl = _DAILY_HISTORY_CACHE_TTL_SECONDS if ttl_seconds is None else float(ttl_seconds)
is_stale = ttl <= 0 or time.time() - stat.st_mtime > ttl
if is_stale and not allow_stale:
return None
try:
payload = json.loads(path.read_text(encoding="utf-8"))
if payload.get("version") != _DAILY_HISTORY_CACHE_VERSION:
return None
frame = payload.get("frame")
if not isinstance(frame, dict):
return None
columns = frame.get("columns")
data = frame.get("data")
if not isinstance(columns, list) or not isinstance(data, list):
return None
df = pd.DataFrame(data, columns=columns)
metadata = payload.get("metadata")
if isinstance(metadata, dict):
for key in ("daily_source", "daily_requested_source", "daily_source_order", "daily_source_order_notes", "source_errors", "daily_source_health"):
if key in metadata:
df.attrs[key] = metadata[key]
if is_stale:
df.attrs["daily_stale"] = True
return df
except Exception:
return None
def _write_daily_history_cache(
    path: Path,
    df: pd.DataFrame,
    *,
    code: str,
    source: str,
    lookback_days: int,
) -> None:
    """Persist a fetched frame and its metadata as JSON; failures are ignored.

    The payload is written to a hidden temporary file next to ``path`` and then
    moved into place, so readers never see a half-written cache file. Caching is
    best-effort: any error is swallowed, and a temporary file left over from a
    failed write is removed.

    Args:
        path: Final cache file location; parent directories are created.
        df: Frame to store; selected ``attrs`` entries are saved as metadata.
        code: Normalized stock code, recorded in the payload key.
        source: Requested source name, recorded in the payload key and used as
            the metadata default.
        lookback_days: Look-back window, recorded in the payload key.
    """
    tmp_path: Path | None = None
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "version": _DAILY_HISTORY_CACHE_VERSION,
            "key": {
                "code": code,
                "source": source,
                "lookback_days": int(lookback_days),
            },
            "metadata": {
                "daily_source": df.attrs.get("daily_source", source),
                "daily_requested_source": df.attrs.get("daily_requested_source", source),
                "daily_source_order": list(df.attrs.get("daily_source_order", [])),
                "daily_source_order_notes": list(df.attrs.get("daily_source_order_notes", [])),
                "source_errors": list(df.attrs.get("source_errors", [])),
                "daily_source_health": df.attrs.get("daily_source_health", {}),
            },
            "created_at": datetime.now().isoformat(),
            "frame": json.loads(df.to_json(orient="split", date_format="iso", force_ascii=False)),
        }
        # Nanosecond suffix keeps concurrent writers from sharing a temp file.
        tmp_path = path.with_name(f".{path.name}.{time.time_ns()}.tmp")
        tmp_path.write_text(json.dumps(payload, ensure_ascii=False), encoding="utf-8")
        tmp_path.replace(path)
    except Exception:
        # Best-effort cache: never fail the fetch, but do not leave the
        # temporary file behind when the write or the final move failed.
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
        return
def _fetch_daily_akshare(code: str, *, lookback_days: int) -> pd.DataFrame:
    """Fetch forward-adjusted (``qfq``) daily bars through akshare.

    The request spans ``max(lookback_days * 2, 90)`` calendar days up to today
    and the last ``max(lookback_days, 30)`` rows are returned.

    Raises:
        RuntimeError: akshare returned nothing.
    """
    import akshare as ak

    calendar_window = timedelta(days=max(lookback_days * 2, 90))
    start_date = (datetime.now() - calendar_window).strftime("%Y%m%d")
    end_date = datetime.now().strftime("%Y%m%d")
    df = ak.stock_zh_a_hist(
        symbol=str(code).zfill(6),
        period="daily",
        start_date=start_date,
        end_date=end_date,
        adjust="qfq",
    )
    if df is None or df.empty:
        raise RuntimeError(f"akshare daily history empty for {code}")
    keep_rows = max(lookback_days, 30)
    return df.tail(keep_rows).copy()
def _fetch_daily_tencent(code: str, *, lookback_days: int) -> pd.DataFrame:
    """Fetch forward-adjusted daily bars from Tencent's HTTP K-line endpoint.

    Rows are read positionally as ``date, open, close, high, low, volume`` with
    an optional seventh ``amount`` value (``NA`` when absent); rows that are not
    lists of at least six items are skipped. All price/volume columns are
    coerced to numbers. At least 30 bars are requested.

    Raises:
        RuntimeError: the API reports an error, or the payload is missing,
            empty or contains no usable row.
        requests.HTTPError: the HTTP status indicates failure.
    """
    symbol = _to_tencent_code(code)
    count = max(int(lookback_days), 30)
    response = requests.get(
        "https://web.ifzq.gtimg.cn/appstock/app/fqkline/get",
        params={"param": f"{symbol},day,,,{count},qfq"},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    response.raise_for_status()
    payload = response.json()

    if not isinstance(payload, dict):
        raise RuntimeError(f"tencent daily API error for {code}: {payload}")
    if payload.get("code") not in (0, "0", None):
        message = payload.get("msg")
        raise RuntimeError(f"tencent daily API error for {code}: {message}")

    data = payload.get("data")
    stock_data = data.get(symbol) if isinstance(data, dict) else None
    if not isinstance(stock_data, dict):
        raise RuntimeError(f"tencent daily history missing payload for {code}")

    # Prefer the forward-adjusted series, fall back to the plain one.
    rows = stock_data.get("qfqday") or stock_data.get("day") or []
    if not isinstance(rows, list) or not rows:
        raise RuntimeError(f"tencent daily history empty for {code}")

    normalized_rows: list[dict[str, object]] = [
        {
            "date": row[0],
            "open": row[1],
            "close": row[2],
            "high": row[3],
            "low": row[4],
            "volume": row[5],
            "amount": row[6] if len(row) > 6 else pd.NA,
        }
        for row in rows
        if isinstance(row, list) and len(row) >= 6
    ]
    if not normalized_rows:
        raise RuntimeError(f"tencent daily history malformed for {code}")

    column_order = ["date", "open", "close", "high", "low", "volume", "amount"]
    df = pd.DataFrame(normalized_rows, columns=column_order)
    for col in column_order[1:]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df.tail(count).copy()
def _fetch_daily_sina(code: str, *, lookback_days: int) -> pd.DataFrame:
    """Fetch daily bars from Sina's HTTP K-line endpoint.

    The request carries no price-adjustment parameter, so the bars are taken as
    unadjusted. Each record's ``day`` (or ``date``), ``open``, ``close``,
    ``high``, ``low``, ``volume`` and optional ``amount`` are collected, sorted
    by date and coerced to numbers. At least 30 bars are requested.

    Raises:
        RuntimeError: the payload holds no data list or no usable record.
        requests.HTTPError: the HTTP status indicates failure.
    """
    symbol = _to_tencent_code(code)
    count = max(int(lookback_days), 30)
    response = requests.get(
        "https://quotes.sina.cn/cn/api/openapi.php/CN_MarketDataService.getKLineData",
        params={"symbol": symbol, "scale": 240, "ma": "no", "datalen": count},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    response.raise_for_status()
    payload = response.json()

    if isinstance(payload, dict):
        data = payload.get("result", {}).get("data")
    else:
        data = None
    if not isinstance(data, list) or not data:
        raise RuntimeError(f"sina daily history empty for {code}")

    rows: list[dict[str, object]] = [
        {
            "date": record.get("day") or record.get("date"),
            "open": record.get("open"),
            "close": record.get("close"),
            "high": record.get("high"),
            "low": record.get("low"),
            "volume": record.get("volume"),
            "amount": record.get("amount", pd.NA),
        }
        for record in data
        if isinstance(record, dict)
    ]
    if not rows:
        raise RuntimeError(f"sina daily history malformed for {code}")

    column_order = ["date", "open", "close", "high", "low", "volume", "amount"]
    df = pd.DataFrame(rows, columns=column_order).sort_values("date")
    for col in column_order[1:]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df.tail(count).copy()
def _fetch_daily_tushare(code: str, *, lookback_days: int) -> pd.DataFrame:
    """Fetch daily bars through Tushare Pro, price-adjusted unless disabled.

    The adjustment mode comes from the ``TUSHARE_DAILY_ADJ`` environment
    variable (default ``qfq``). The request spans ``max(lookback_days * 2, 90)``
    calendar days and the last ``max(lookback_days, 30)`` rows are returned with
    ``date``/``volume`` column names.

    Raises:
        RuntimeError: no token is configured, the adjustment mode is
            unsupported, or Tushare returned nothing.
    """
    token = _tushare_token()
    if not token:
        raise RuntimeError("tushare requires TUSHARE_TOKEN")

    import tushare as ts

    pro = ts.pro_api(token)
    _configure_tushare_client(pro, token=token)

    calendar_window = timedelta(days=max(lookback_days * 2, 90))
    start_date = (datetime.now() - calendar_window).strftime("%Y%m%d")
    end_date = datetime.now().strftime("%Y%m%d")
    adj = _normalize_tushare_adj(os.getenv("TUSHARE_DAILY_ADJ", "qfq"))
    ts_code = _to_tushare_code(code)

    df = pro.daily(
        ts_code=ts_code,
        start_date=start_date,
        end_date=end_date,
        fields="ts_code,trade_date,open,high,low,close,vol,amount",
    )
    if df is None or df.empty:
        raise RuntimeError(f"tushare daily history empty for {code}")

    if adj is not None:
        df = _apply_tushare_adjustment(
            df,
            pro=pro,
            ts_code=ts_code,
            start_date=start_date,
            end_date=end_date,
            adj=adj,
        )
    keep_rows = max(lookback_days, 30)
    return _normalize_tushare_daily_frame(df).tail(keep_rows).copy()
def _tushare_token() -> str:
    """Return the Tushare token from the environment, or ``""`` if none is set.

    ``TUSHARE_TOKEN`` takes precedence over ``TUSHARE_API_TOKEN``; surrounding
    whitespace is stripped.
    """
    for variable in ("TUSHARE_TOKEN", "TUSHARE_API_TOKEN"):
        token = os.getenv(variable, "").strip()
        if token:
            return token
    return ""
def _has_tushare_token() -> bool:
    """Tell whether a non-empty Tushare token is configured in the environment."""
    return _tushare_token() != ""
def _configure_tushare_client(pro: object, *, token: str) -> None:
    """Set the token and HTTP endpoint on a Tushare client object.

    The endpoint is taken from ``TUSHARE_API_URL``, then ``TUSHARE_HTTP_URL``,
    then the module default. Both values are written to the attributes
    ``_DataApi__token`` and ``_DataApi__http_url``; a failure to set either one
    is ignored.
    """
    http_url = (
        os.getenv("TUSHARE_API_URL", "").strip()
        or os.getenv("TUSHARE_HTTP_URL", "").strip()
        or _DEFAULT_TUSHARE_HTTP_URL
    )
    overrides = (
        ("_DataApi__token", token),
        ("_DataApi__http_url", http_url),
    )
    for attribute, value in overrides:
        try:
            setattr(pro, attribute, value)
        except Exception:
            # Best effort: a client that rejects the attribute keeps its own value.
            pass
def _normalize_tushare_daily_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Rename Tushare columns to the common schema and sort by date.

    ``trade_date`` becomes ``date`` (as strings) and ``vol`` becomes
    ``volume``. The input frame is not modified.
    """
    normalized = df.rename(columns={"trade_date": "date", "vol": "volume"}).copy()
    if "date" not in normalized.columns:
        return normalized
    normalized["date"] = normalized["date"].astype(str)
    return normalized.sort_values("date")
def _apply_tushare_adjustment(
    df: pd.DataFrame,
    *,
    pro: object,
    ts_code: str,
    start_date: str,
    end_date: str,
    adj: str,
) -> pd.DataFrame:
    """Apply Tushare adjustment factors to the OHLC columns of ``df``.

    Args:
        df: Raw daily bars containing ``trade_date`` and the OHLC columns.
        pro: Tushare client exposing ``adj_factor(...)``.
        ts_code: Tushare-style code passed to ``adj_factor``.
        start_date: Start of the factor query range.
        end_date: End of the factor query range.
        adj: ``"hfq"`` multiplies prices by the factor; any other value
            (``"qfq"``) additionally divides by the most recent factor.

    Returns:
        A frame sorted by ``trade_date`` with adjusted prices and without the
        ``adj_factor`` column.

    Raises:
        RuntimeError: no factor rows, or no numeric factor, were returned.
    """
    factors = pro.adj_factor(
        ts_code=ts_code,
        start_date=start_date,
        end_date=end_date,
        fields="trade_date,adj_factor",
    )
    if factors is None or factors.empty:
        raise RuntimeError(f"tushare adj_factor empty for {ts_code}")

    merged = df.merge(factors, on="trade_date", how="left")
    merged = merged.sort_values("trade_date")
    # Bars without a factor of their own borrow the next later one.
    merged["adj_factor"] = pd.to_numeric(merged["adj_factor"], errors="coerce").bfill()

    valid_factors = factors.assign(
        adj_factor=pd.to_numeric(factors["adj_factor"], errors="coerce")
    ).dropna(subset=["adj_factor"])
    if valid_factors.empty:
        raise RuntimeError(f"tushare adj_factor invalid for {ts_code}")
    # Pick the factor of the latest trade date explicitly rather than relying
    # on the order in which the API happens to return its rows.
    latest_factor = float(valid_factors.sort_values("trade_date")["adj_factor"].iloc[-1])

    for col in ("open", "high", "low", "close"):
        merged[col] = pd.to_numeric(merged[col], errors="coerce")
        if adj == "hfq":
            merged[col] = merged[col] * merged["adj_factor"]
        else:
            merged[col] = merged[col] * merged["adj_factor"] / latest_factor
    return merged.drop(columns=["adj_factor"])
def _normalize_tushare_adj(value: str | None) -> str | None:
text = (value or "").strip().lower()
if text in {"", "none", "null", "no", "false", "0"}:
return None
if text not in {"qfq", "hfq"}:
raise RuntimeError(f"unsupported TUSHARE_DAILY_ADJ: {value}")
return text
def _fetch_daily_baostock(code: str, *, lookback_days: int) -> pd.DataFrame:
    """Fetch daily bars through Baostock with ``adjustflag="2"``.

    Login, query and logout run under ``_BAOSTOCK_LOCK``. Once a query error is
    classified as a network outage, its message is remembered in
    ``_BAOSTOCK_OUTAGE_ERROR`` and every later call fails immediately with it.
    The request spans ``max(lookback_days * 2, 90)`` calendar days and the last
    ``max(lookback_days, 30)`` rows are returned; values are kept as Baostock
    delivers them.

    Raises:
        RuntimeError: baostock is not installed, an outage was recorded, login
            or query failed, or no row was returned.
    """
    global _BAOSTOCK_OUTAGE_ERROR

    try:
        import baostock as bs
    except ImportError as exc:
        raise RuntimeError("baostock not installed; pip install baostock") from exc

    # Cheap check before taking the lock.
    if _BAOSTOCK_OUTAGE_ERROR is not None:
        raise RuntimeError(_BAOSTOCK_OUTAGE_ERROR)

    bs_code = _to_baostock_code(code)
    calendar_window = timedelta(days=max(lookback_days * 2, 90))
    start_date = (datetime.now() - calendar_window).strftime("%Y-%m-%d")
    end_date = datetime.now().strftime("%Y-%m-%d")

    with _BAOSTOCK_LOCK:
        # Re-check: another thread may have recorded an outage while we waited.
        if _BAOSTOCK_OUTAGE_ERROR is not None:
            raise RuntimeError(_BAOSTOCK_OUTAGE_ERROR)
        login_result = bs.login()
        try:
            login_error_code = str(getattr(login_result, "error_code", "0"))
            if login_error_code not in {"", "0"}:
                login_error_msg = getattr(login_result, "error_msg", "")
                raise RuntimeError(f"baostock login error {login_error_code}: {login_error_msg}")

            rs = bs.query_history_k_data_plus(
                bs_code,
                "date,open,high,low,close,volume,amount",
                start_date=start_date,
                end_date=end_date,
                frequency="d",
                adjustflag="2",
            )
            if rs.error_code != "0":
                message = f"baostock error {rs.error_code}: {rs.error_msg}"
                if _is_baostock_network_outage(rs.error_code, rs.error_msg):
                    _BAOSTOCK_OUTAGE_ERROR = message
                raise RuntimeError(message)

            rows = []
            while rs.next():
                rows.append(rs.get_row_data())
        finally:
            # Always try to log out; a logout failure must not mask the result.
            try:
                bs.logout()
            except Exception:
                pass

    if not rows:
        raise RuntimeError(f"baostock daily history empty for {code}")
    column_order = ["date", "open", "high", "low", "close", "volume", "amount"]
    df = pd.DataFrame(rows, columns=column_order)
    return df.tail(max(lookback_days, 30)).copy()
def _to_baostock_code(code: str) -> str:
    """Convert a numeric code to Baostock form (``sh.600519`` / ``sz.000001``).

    Codes starting with 6, 9 or 5 map to ``sh``; everything else maps to ``sz``.
    """
    # NOTE(review): unlike the Tushare/Tencent converters there is no
    # Beijing-exchange branch here, so 4/8-prefixed codes become "sz." and
    # 920-prefixed codes become "sh." - confirm this is intended.
    raw = str(code).strip().zfill(6)
    exchange = "sh" if raw[0] in {"6", "9", "5"} else "sz"
    return f"{exchange}.{raw}"
def _to_tushare_code(code: str) -> str:
    """Convert a numeric code to Tushare form (``600519.SH``).

    Prefixes 4, 8 and 920 map to ``BJ``; 6, 9 and 5 map to ``SH``; everything
    else maps to ``SZ``. The Beijing check runs first, so 920 wins over 9.
    """
    raw = str(code).strip().zfill(6)
    if raw[0] in {"4", "8"} or raw[:3] == "920":
        suffix = "BJ"
    elif raw[0] in {"6", "9", "5"}:
        suffix = "SH"
    else:
        suffix = "SZ"
    return f"{raw}.{suffix}"
def _to_tencent_code(code: str) -> str:
    """Convert a numeric code to the prefixed form ``sh600519`` / ``sz000001`` / ``bj830799``.

    Prefixes 4, 8 and 920 map to ``bj``; 6, 9 and 5 map to ``sh``; everything
    else maps to ``sz``. The Beijing check runs first, so 920 wins over 9.
    """
    raw = str(code).strip().zfill(6)
    if raw[0] in {"4", "8"} or raw[:3] == "920":
        market = "bj"
    elif raw[0] in {"6", "9", "5"}:
        market = "sh"
    else:
        market = "sz"
    return f"{market}{raw}"
def _is_baostock_network_outage(error_code: object, error_msg: object) -> bool:
    """Tell whether a Baostock error looks like a network outage.

    True for error code ``10002007`` or when the message contains one of the
    marker substrings checked below.
    """
    if str(error_code) == "10002007":
        return True
    message = str(error_msg)
    return any(marker in message for marker in ("网络", "接收"))
def compute_daily_features(hist: pd.DataFrame) -> dict[str, object]:
    """Derive trend, momentum, shape and quality features from daily bars.

    The result holds the number of valid closes, the change versus the close
    60 bars back (or the first bar when fewer exist), the latest 5/20/60-bar
    moving averages, two trend booleans, MACD and RSI classifications, a
    0-100 signal score, and the shape and data-quality fields.

    Raises:
        RuntimeError: the history is empty after normalization or has no
            valid close price.
    """
    df = _normalize_daily_history(hist)
    if df.empty:
        raise RuntimeError("daily history is empty after normalization")
    close = pd.to_numeric(df["close"], errors="coerce").dropna()
    if close.empty:
        raise RuntimeError("daily history has no valid close price")

    last_close = float(close.iloc[-1])
    last_ma5, last_ma20, last_ma60 = (
        _last_float(close.rolling(window).mean()) for window in (5, 20, 60)
    )

    shape = _compute_shape_features(df, last_close=last_close, last_ma20=last_ma20)
    quality = _compute_daily_quality(hist, df)

    # Reference close: 60 bars before the last one, clamped to the first bar.
    base_close = float(close.iloc[max(0, len(close) - 61)])
    if base_close > 0:
        change_60d = (last_close / base_close - 1.0) * 100
    else:
        change_60d = None

    macd_status = _compute_macd_status(close)
    rsi_value = _compute_rsi(close)
    rsi_status = _classify_rsi(rsi_value)

    ma_bullish = _is_true(
        last_ma5 is not None
        and last_ma20 is not None
        and last_ma60 is not None
        and last_ma5 >= last_ma20 >= last_ma60
    )
    price_above_ma20 = _is_true(last_ma20 is not None and last_close >= last_ma20)
    signal_score = _compute_signal_score(
        change_60d=change_60d,
        ma_bullish=ma_bullish,
        price_above_ma20=price_above_ma20,
        macd_status=macd_status,
        rsi_status=rsi_status,
    )

    features: dict[str, object] = {
        "daily_data_points": int(len(close)),
        "change_60d": None if change_60d is None else round(float(change_60d), 4),
        "ma5": last_ma5,
        "ma20": last_ma20,
        "ma60": last_ma60,
        "ma_bullish": ma_bullish,
        "price_above_ma20": price_above_ma20,
        "macd_status": macd_status,
        "rsi_status": rsi_status,
        "rsi14": None if rsi_value is None else round(float(rsi_value), 4),
        "signal_score": round(float(signal_score), 4),
    }
    features.update(shape)
    features.update(quality)
    return features
def _normalize_daily_history(hist: pd.DataFrame) -> pd.DataFrame:
    """Bring a raw daily-history frame into the common numeric schema.

    Chinese column headers are renamed to ``date``/``open``/``high``/``low``/
    ``close``/``volume``/``amount``, rows are sorted by ``date`` when present,
    price and volume columns are coerced to numbers, and rows without a valid
    close are dropped. Missing ``open``/``high``/``low`` columns or values are
    filled from ``close``.

    Raises:
        RuntimeError: there is no close column.
    """
    chinese_headers = {
        "日期": "date",
        "收盘": "close",
        "开盘": "open",
        "最高": "high",
        "最低": "low",
        "成交量": "volume",
        "成交额": "amount",
    }
    df = hist.rename(columns=chinese_headers).copy()
    if "date" in df.columns:
        df = df.sort_values("date")
    if "close" not in df.columns:
        raise RuntimeError("daily history has no close column")

    present = [col for col in ("open", "high", "low", "close", "volume") if col in df.columns]
    for col in present:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.dropna(subset=["close"]).copy()
    for col in ("open", "high", "low"):
        df[col] = df[col].fillna(df["close"]) if col in df.columns else df["close"]
    return df
def _compute_daily_quality(raw: pd.DataFrame, normalized: pd.DataFrame) -> dict[str, object]:
    """Score the quality of a daily history and list the reasons for deductions.

    Starting from 100, points are deducted for short history, missing or
    incomplete OHLC/volume columns, negative volume, inconsistent or
    non-positive prices, a stale cache entry and fallback errors recorded in
    ``raw.attrs``. Returns ``daily_quality_score`` (floored at 0, rounded to 4
    places) and ``daily_quality_flags`` (``;``-joined, in detection order).
    """
    score = 100.0
    flags: list[str] = []

    def penalize(amount: float, flag: str) -> None:
        nonlocal score
        score -= amount
        flags.append(flag)

    def numeric(column: str) -> pd.Series:
        return pd.to_numeric(normalized[column], errors="coerce")

    # History length.
    points = len(normalized)
    if points < 30:
        penalize(35, "short_history_lt30")
    elif points < 60:
        penalize(15, "short_history_lt60")

    # Presence and completeness of each price column.
    price_columns = ("open", "high", "low", "close")
    for col in price_columns:
        if col not in normalized.columns:
            penalize(20, f"missing_{col}")
            continue
        missing_ratio = float(numeric(col).isna().mean())
        if missing_ratio > 0:
            penalize(min(missing_ratio * 40, 20), f"incomplete_{col}")

    # Volume presence, completeness and sign.
    if "volume" in normalized.columns:
        volume = numeric("volume")
        missing_volume_ratio = float(volume.isna().mean())
        if missing_volume_ratio > 0:
            penalize(min(missing_volume_ratio * 20, 10), "incomplete_volume")
        if (volume.dropna() < 0).any():
            penalize(20, "negative_volume")
    else:
        penalize(12, "missing_volume")

    # Internal consistency of the bars (only when all four prices exist).
    if set(price_columns).issubset(normalized.columns):
        open_, high, low, close = (numeric(col) for col in price_columns)
        invalid_ohlc = (high < low) | (high < open_) | (high < close) | (low > open_) | (low > close)
        if invalid_ohlc.fillna(False).any():
            penalize(30, "invalid_ohlc")
        non_positive = (open_ <= 0) | (high <= 0) | (low <= 0) | (close <= 0)
        if non_positive.fillna(False).any():
            penalize(35, "non_positive_price")

    # Provenance recorded by the fetch layer.
    if bool(raw.attrs.get("daily_stale")):
        penalize(25, "stale_cache")
    source_errors = list(raw.attrs.get("source_errors", []) or [])
    if source_errors:
        penalize(min(len(source_errors) * 5, 20), "fallback_errors")

    return {
        "daily_quality_score": round(max(score, 0.0), 4),
        "daily_quality_flags": ";".join(flags),
    }
def _compute_shape_features(
    df: pd.DataFrame,
    *,
    last_close: float,
    last_ma20: float | None,
) -> dict[str, object]:
    """Compute price-shape features around the most recent bar.

    ``previous`` is the 20 bars before the last one; ``recent`` is the last 20
    bars including it. Every value is rounded to 4 places or ``None`` when it
    cannot be computed (the consolidation count is an integer or ``None``).
    """
    previous = df.iloc[:-1].tail(20)
    recent = df.tail(20)
    last_bar = df.iloc[-1]

    prev_high_20d = None
    if "high" in previous.columns:
        prev_high_20d = _series_max(previous["high"])

    # Distance of the last close from the prior 20-bar high.
    breakout_20d_pct = None
    if prev_high_20d is not None and prev_high_20d > 0:
        breakout_20d_pct = (last_close / prev_high_20d - 1.0) * 100

    # Distance of the last close from the 20-bar moving average.
    pullback_to_ma20_pct = None
    if last_ma20 is not None and last_ma20 > 0:
        pullback_to_ma20_pct = (last_close / last_ma20 - 1.0) * 100

    range_20d_pct = _range_pct(recent)
    volume_ratio_20d = _volume_ratio_20d(df)
    body_pct = _body_pct(last_bar)
    volatility_20d_pct = _volatility_20d_pct(recent["close"])
    max_drawdown_20d_pct = _max_drawdown_pct(recent["close"])
    atr_20_pct = _atr_20_pct(df)

    return {
        "prev_high_20d": _round_or_none(prev_high_20d),
        "range_20d_pct": _round_or_none(range_20d_pct),
        "breakout_20d_pct": _round_or_none(breakout_20d_pct),
        "volume_ratio_20d": _round_or_none(volume_ratio_20d),
        "body_pct": _round_or_none(body_pct),
        "pullback_to_ma20_pct": _round_or_none(pullback_to_ma20_pct),
        "consolidation_days_20d": _consolidation_days(previous),
        "volatility_20d_pct": _round_or_none(volatility_20d_pct),
        "max_drawdown_20d_pct": _round_or_none(max_drawdown_20d_pct),
        "atr_20_pct": _round_or_none(atr_20_pct),
    }
def _series_max(series: pd.Series) -> float | None:
values = pd.to_numeric(series, errors="coerce").dropna()
if values.empty:
return None
return float(values.max())
def _range_pct(df: pd.DataFrame) -> float | None:
if "high" not in df.columns or "low" not in df.columns:
return None
high = pd.to_numeric(df["high"], errors="coerce").dropna()
low = pd.to_numeric(df["low"], errors="coerce").dropna()
if high.empty or low.empty:
return None
low_min = float(low.min())
if low_min <= 0:
return None
return (float(high.max()) / low_min - 1.0) * 100
def _volume_ratio_20d(df: pd.DataFrame) -> float | None:
if "volume" not in df.columns:
return None
volume = pd.to_numeric(df["volume"], errors="coerce")
if len(volume) < 2 or pd.isna(volume.iloc[-1]):
return None
previous = volume.iloc[:-1].tail(20).dropna()
if previous.empty:
return None
base = float(previous.mean())
if base <= 0:
return None
return float(volume.iloc[-1]) / base
def _volatility_20d_pct(close: pd.Series) -> float | None:
values = pd.to_numeric(close, errors="coerce").dropna()
returns = values.pct_change().dropna()
if len(returns) < 2:
return None
return float(returns.std()) * (252 ** 0.5) * 100
def _max_drawdown_pct(close: pd.Series) -> float | None:
values = pd.to_numeric(close, errors="coerce").dropna()
if values.empty:
return None
running_high = values.cummax()
drawdowns = values / running_high - 1.0
return min(float(drawdowns.min()) * 100, 0.0)
def _atr_20_pct(df: pd.DataFrame) -> float | None:
if not {"high", "low", "close"}.issubset(df.columns):
return None
high = pd.to_numeric(df["high"], errors="coerce")
low = pd.to_numeric(df["low"], errors="coerce")
close = pd.to_numeric(df["close"], errors="coerce")
previous_close = close.shift(1)
true_range = pd.concat([
high - low,
(high - previous_close).abs(),
(low - previous_close).abs(),
], axis=1).max(axis=1)
atr = true_range.tail(20).dropna().mean()
valid_close = close.dropna()
if valid_close.empty:
return None
last_close = float(valid_close.iloc[-1])
if pd.isna(atr) or last_close <= 0:
return None
return float(atr) / last_close * 100
def _consolidation_days(previous: pd.DataFrame, *, max_range_pct: float = 12.0) -> int | None:
if previous.empty or "high" not in previous.columns or "low" not in previous.columns:
return None
for days in range(min(len(previous), 20), 1, -1):
window = previous.tail(days)
range_pct = _range_pct(window)
if range_pct is not None and range_pct <= max_range_pct:
return int(days)
return 0
def _body_pct(row: pd.Series) -> float | None:
open_price = row.get("open")
close_price = row.get("close")
if pd.isna(open_price) or pd.isna(close_price) or float(open_price) <= 0:
return None
return (float(close_price) / float(open_price) - 1.0) * 100
def _round_or_none(value: float | None) -> float | None:
if value is None or pd.isna(value):
return None
return round(float(value), 4)
def _compute_macd_status(close: pd.Series) -> str:
    """Classify the latest MACD reading as ``bullish``, ``bearish`` or ``neutral``.

    Uses 12/26-span EMAs for the difference line and a 9-span EMA of it as the
    signal line. Bullish means the difference is positive and above the signal;
    bearish means negative and below it. Fewer than 35 closes give ``neutral``.
    """
    if len(close) < 35:
        return "neutral"

    def ema(series: pd.Series, span: int) -> pd.Series:
        return series.ewm(span=span, adjust=False).mean()

    diff = ema(close, 12) - ema(close, 26)
    dea = ema(diff, 9)
    last_diff = float(diff.iloc[-1])
    last_dea = float(dea.iloc[-1])

    if last_diff > 0 and last_diff > last_dea:
        return "bullish"
    if last_diff < 0 and last_diff < last_dea:
        return "bearish"
    return "neutral"
def _compute_rsi(close: pd.Series, period: int = 14) -> float | None:
if len(close) <= period:
return None
delta = close.diff()
gain = delta.clip(lower=0).rolling(period).mean()
loss = (-delta.clip(upper=0)).rolling(period).mean()
rs = gain / loss.replace(0, pd.NA)
rsi = 100 - (100 / (1 + rs))
value = rsi.iloc[-1]
if pd.isna(value):
return None
return float(value)
def _classify_rsi(value: float | None) -> str:
if value is None:
return "neutral"
if value <= 35:
return "oversold"
if value >= 70:
return "overbought"
return "neutral"
def _compute_signal_score(
*,
change_60d: float | None,
ma_bullish: bool,
price_above_ma20: bool,
macd_status: str,
rsi_status: str,
) -> float:
score = 50.0
if ma_bullish:
score += 14
if price_above_ma20:
score += 10
if macd_status == "bullish":
score += 12
elif macd_status == "bearish":
score -= 12
if change_60d is not None:
if 0 <= change_60d <= 35:
score += min(change_60d * 0.35, 12)
elif change_60d > 60:
score -= min((change_60d - 60) * 0.20, 12)
elif change_60d < -25:
score -= min(abs(change_60d + 25) * 0.25, 10)
if rsi_status == "oversold":
score += 4
elif rsi_status == "overbought":
score -= 6
return max(0.0, min(score, 100.0))
def _last_float(series: pd.Series) -> float | None:
value = series.iloc[-1]
if pd.isna(value):
return None
return round(float(value), 4)
def _is_true(value: bool) -> bool:
    """Coerce ``value`` to a plain built-in ``bool`` by its truthiness."""
    return True if value else False