fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
830 lines
32 KiB
Python
830 lines
32 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""Industry and concept enrichment for candidate snapshots."""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import os
|
||
import time
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
import pandas as pd
|
||
|
||
from alphasift.normalize import (
|
||
normalize_code as _normalize_code,
|
||
safe_float as _safe_float,
|
||
safe_text as _safe_text,
|
||
)
|
||
|
||
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||
_AKSHARE_BOARD_CACHE_SCHEMA = "v1"
|
||
_CACHE_DIR_UNSET = object()
|
||
_NUMERIC_FIELDS = (
|
||
"industry_rank",
|
||
"industry_change_pct",
|
||
"industry_heat_score",
|
||
"concept_heat_score",
|
||
"board_heat_score",
|
||
"board_heat_latest_score",
|
||
"board_heat_trend_score",
|
||
"board_heat_persistence_score",
|
||
"board_heat_cooling_score",
|
||
"board_heat_observations",
|
||
)
|
||
_TEXT_FIELDS = ("board_heat_summary", "board_heat_state")
|
||
_HEAT_FIELDS = (*_NUMERIC_FIELDS, *_TEXT_FIELDS)
|
||
_FIELD_ALIASES = {
|
||
"industry_rank": ["industry_rank", "行业排名", "板块排名", "排名"],
|
||
"industry_change_pct": ["industry_change_pct", "行业涨跌幅", "板块涨跌幅", "涨跌幅"],
|
||
"industry_heat_score": ["industry_heat_score", "行业热度分"],
|
||
"concept_heat_score": ["concept_heat_score", "概念热度分"],
|
||
"board_heat_score": ["board_heat_score", "theme_heat_score", "板块热度分", "主题热度分"],
|
||
"board_heat_latest_score": ["board_heat_latest_score", "板块最新热度分", "主题最新热度分"],
|
||
"board_heat_trend_score": ["board_heat_trend_score", "板块热度趋势分", "主题热度趋势分"],
|
||
"board_heat_persistence_score": ["board_heat_persistence_score", "板块热度持续分", "主题热度持续分"],
|
||
"board_heat_cooling_score": ["board_heat_cooling_score", "板块降温分", "主题降温分"],
|
||
"board_heat_observations": ["board_heat_observations", "板块热度观测数", "主题热度观测数"],
|
||
"board_heat_summary": ["board_heat_summary", "theme_heat_summary", "板块热度", "主题热度"],
|
||
"board_heat_state": ["board_heat_state", "板块热度状态", "主题热度状态"],
|
||
}
|
||
|
||
|
||
def enrich_industry_concepts(
    df: pd.DataFrame,
    *,
    map_files: list[str | Path] | None = None,
    provider: str = "none",
    max_boards: int = 80,
    provider_cache_dir: str | Path | None | object = _CACHE_DIR_UNSET,
    provider_cache_ttl_hours: float | None = None,
) -> tuple[pd.DataFrame, list[str]]:
    """Attach industry/concepts columns from stable files and optional providers.

    Args:
        df: Candidate snapshot; it is copied, never mutated. Must contain a
            ``code`` column for any enrichment to happen.
        map_files: Mapping files passed to ``load_industry_map``; a companion
            ``<file>.history.jsonl`` is used for heat trends when present.
        provider: Optional live provider. ``""``, ``"none"``, ``"off"`` and
            ``"false"`` (case-insensitive) disable it; only ``"akshare"`` is
            supported.
        max_boards: Forwarded to ``fetch_akshare_board_map``.
        provider_cache_dir: Forwarded as ``cache_dir`` to the provider.
        provider_cache_ttl_hours: Forwarded as ``cache_ttl_hours``.

    Returns:
        ``(enriched_frame, notes)`` where ``notes`` are human-readable
        progress/diagnostic messages.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``load_industry_map``.
    """
    result = df.copy()
    notes: list[str] = []
    if result.empty or "code" not in result.columns:
        return result, notes

    # Guarantee every output column exists, even when nothing gets filled.
    if "industry" not in result.columns:
        result["industry"] = ""
    if "concepts" not in result.columns:
        result["concepts"] = ""
    for field in _NUMERIC_FIELDS:
        if field not in result.columns:
            result[field] = pd.NA
    for field in _TEXT_FIELDS:
        if field not in result.columns:
            result[field] = ""

    mapping: dict[str, dict[str, object]] = {}
    for path_like in map_files or []:
        file_mapping = load_industry_map(path_like)
        trend_mapping, trend_note = _load_companion_board_heat_trends(path_like)
        if trend_mapping:
            _apply_board_heat_trends(file_mapping, trend_mapping)
        _merge_mapping(mapping, file_mapping)
        notes.append(f"industry map loaded: {path_like} rows={len(file_mapping)}")
        if trend_note:
            notes.append(trend_note)

    if provider and provider.lower() not in {"", "none", "off", "false"}:
        if provider.lower() == "akshare":
            provider_mapping, provider_notes = fetch_akshare_board_map(
                max_boards=max_boards,
                cache_dir=provider_cache_dir,
                cache_ttl_hours=provider_cache_ttl_hours,
            )
            _merge_mapping(mapping, provider_mapping)
            notes.extend(provider_notes)
        else:
            notes.append(f"industry provider skipped: unsupported provider={provider}")

    if not mapping:
        return result, notes

    result, filled_industry, filled_concepts, filled_heat = _apply_mapping_to_snapshot(
        result,
        mapping,
    )

    notes.append(
        "industry/concepts enrichment applied: "
        f"industry={filled_industry}, concepts={filled_concepts}, heat={filled_heat}"
    )
    return result, notes
|
||
|
||
|
||
def load_industry_map(path_like: str | Path) -> dict[str, dict[str, object]]:
    """Load code -> industry/concepts mapping from CSV, JSON or JSONL.

    Accepted layouts:
      * ``.csv``: one row per code (all cells read as strings).
      * ``.jsonl``: one JSON object per non-blank line; non-objects are ignored.
      * ``.json``: a list of objects, or an object keyed by code whose values
        are either objects or a plain industry string.

    Args:
        path_like: Path of the mapping file.

    Returns:
        Mapping of normalized code to a dict holding ``industry``,
        ``concepts`` and any heat fields found in the row. A later row for
        the same code replaces an earlier one.

    Raises:
        FileNotFoundError: ``path_like`` is not an existing file.
        ValueError: The file suffix is not one of the supported formats.
    """
    path = Path(path_like)
    if not path.is_file():
        raise FileNotFoundError(f"Industry map file not found: {path}")

    suffix = path.suffix.lower()
    if suffix == ".csv":
        # dtype=str keeps codes such as "000001" from losing leading zeros.
        rows = pd.read_csv(path, dtype=str).fillna("").to_dict(orient="records")
    elif suffix == ".jsonl":
        rows = []
        for line in path.read_text(encoding="utf-8").splitlines():
            line = line.strip()
            if line:
                item = json.loads(line)
                if isinstance(item, dict):
                    rows.append(item)
    elif suffix == ".json":
        data = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(data, list):
            rows = [item for item in data if isinstance(item, dict)]
        elif isinstance(data, dict):
            rows = []
            for code, value in data.items():
                if isinstance(value, dict):
                    rows.append({"code": code, **value})
                elif isinstance(value, str):
                    rows.append({"code": code, "industry": value})
        else:
            rows = []
    else:
        raise ValueError(f"Unsupported industry map format: {path}")

    mapping: dict[str, dict[str, object]] = {}
    for row in rows:
        code = _normalize_code(row.get("code") or row.get("代码"))
        # "000000" is skipped as an invalid code.
        if not code or code == "000000":
            continue
        industry = _safe_text(row.get("industry") or row.get("行业") or row.get("所属行业"))
        concepts = _safe_text(row.get("concepts") or row.get("概念") or row.get("概念题材"))
        item: dict[str, object] = {
            "industry": industry,
            "concepts": concepts,
        }
        for field in _HEAT_FIELDS:
            value = _first_row_value(row, _FIELD_ALIASES.get(field, [field]))
            if field in _NUMERIC_FIELDS:
                parsed = _safe_float(value)
                if parsed is not None:
                    # Rank and observation count are stored as integers.
                    item[field] = int(parsed) if field in {"industry_rank", "board_heat_observations"} else parsed
            else:
                text = _safe_text(value)
                if text:
                    item[field] = text
        mapping[code] = item
    return mapping
|
||
|
||
|
||
def fetch_akshare_board_map(
    *,
    max_boards: int = 80,
    cache_dir: str | Path | None | object = _CACHE_DIR_UNSET,
    cache_ttl_seconds: float | None = None,
    cache_ttl_hours: float | None = None,
) -> tuple[dict[str, dict[str, object]], list[str]]:
    """Build a code mapping from AkShare industry/concept board constituents.

    This is intentionally optional because it may require many third-party
    requests. For production, a cached CSV/JSON map is preferred.

    Args:
        max_boards: Maximum number of boards fetched per board kind
            (values below 1 are treated as 1).
        cache_dir: Cache directory; the default sentinel selects the
            environment-derived directory and ``None`` disables caching.
        cache_ttl_seconds: Cache lifetime in seconds (wins over hours).
        cache_ttl_hours: Cache lifetime in hours.

    Returns:
        ``(mapping, notes)``: per-code industry/concept/heat data and
        diagnostic messages. A fresh cache hit is returned without importing
        or calling AkShare.
    """
    board_limit = max(int(max_boards), 1)
    notes: list[str] = []
    resolved_cache_dir = _resolve_akshare_board_cache_dir(cache_dir)
    cache_path = (
        _akshare_board_cache_path(resolved_cache_dir, max_boards=board_limit)
        if resolved_cache_dir is not None
        else None
    )
    if cache_path is not None:
        cached_mapping, cache_note = _read_akshare_board_cache(
            cache_path,
            max_boards=board_limit,
            ttl_seconds=_resolve_cache_ttl_seconds(
                cache_ttl_seconds=cache_ttl_seconds,
                cache_ttl_hours=cache_ttl_hours,
            ),
        )
        if cache_note:
            notes.append(cache_note)
        if cached_mapping is not None:
            return cached_mapping, notes

    # Imported lazily so the dependency is only needed when actually fetching.
    import akshare as ak

    mapping: dict[str, dict[str, object]] = {}
    board_specs = [
        ("industry", ak.stock_board_industry_name_em, ak.stock_board_industry_cons_em),
        ("concepts", ak.stock_board_concept_name_em, ak.stock_board_concept_cons_em),
    ]
    for field, list_func, cons_func in board_specs:
        try:
            boards = list_func()
        except Exception as exc:
            # Best effort: one failing board kind must not abort the other.
            notes.append(f"akshare {field} board list failed: {exc}")
            continue
        board_items = _board_items(boards)[:board_limit]
        loaded = 0
        for board_item in board_items:
            board = board_item["name"]
            try:
                members = cons_func(symbol=board)
            except Exception as exc:
                notes.append(f"akshare {field} board skipped {board}: {exc}")
                continue
            # Converted once per board and reused for every member row.
            change_pct = _safe_float(board_item.get("change_pct"))
            rank = _safe_float(board_item.get("rank"))
            heat_score = _board_heat_score(change_pct=change_pct, rank=rank)
            heat_summary = _board_heat_summary(board, change_pct=change_pct, rank=rank)
            for _, row in members.iterrows():
                code = _normalize_code(row.get("代码") or row.get("code"))
                if not code or code == "000000":
                    continue
                item = mapping.setdefault(code, {"industry": "", "concepts": ""})
                if field == "industry" and not item["industry"]:
                    # The first industry board seen for a code wins.
                    item["industry"] = board
                    if board_item.get("rank") is not None:
                        item["industry_rank"] = int(float(board_item["rank"]))
                    if board_item.get("change_pct") is not None:
                        item["industry_change_pct"] = change_pct
                    item["industry_heat_score"] = heat_score
                elif field == "concepts":
                    item["concepts"] = _merge_label_text(item.get("concepts", ""), board)
                    item["concept_heat_score"] = _max_numeric(item.get("concept_heat_score"), heat_score)
                # Board-level heat keeps the hottest board the code belongs to.
                item["board_heat_score"] = _max_numeric(item.get("board_heat_score"), heat_score)
                item["board_heat_summary"] = _merge_summary_text(
                    _safe_text(item.get("board_heat_summary")),
                    heat_summary,
                )
            loaded += 1
        notes.append(f"akshare {field} boards loaded: {loaded}/{len(board_items)}")
    if cache_path is not None and mapping:
        cache_note = _write_akshare_board_cache(cache_path, mapping, max_boards=board_limit)
        if cache_note:
            notes.append(cache_note)
    return mapping, notes
|
||
|
||
|
||
def save_industry_map(mapping: dict[str, dict[str, object]], path_like: str | Path) -> Path:
    """Persist a code->industry/concepts mapping as CSV or JSON.

    Args:
        mapping: Per-code data; missing fields are written as ``""``.
        path_like: Destination. A ``.json`` suffix writes a JSON list of row
            objects; any other suffix writes CSV. Parent directories are
            created as needed.

    Returns:
        The destination path.
    """
    path = Path(path_like)
    path.parent.mkdir(parents=True, exist_ok=True)
    value_columns = (
        "industry",
        "concepts",
        "industry_rank",
        "industry_change_pct",
        "industry_heat_score",
        "concept_heat_score",
        "board_heat_score",
        "board_heat_latest_score",
        "board_heat_trend_score",
        "board_heat_persistence_score",
        "board_heat_cooling_score",
        "board_heat_observations",
        "board_heat_summary",
        "board_heat_state",
    )
    rows = [
        {"code": code, **{column: item.get(column, "") for column in value_columns}}
        for code, item in sorted(mapping.items())
    ]
    if path.suffix.lower() == ".json":
        path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    else:
        # Explicit columns keep the header row even for an empty mapping, so
        # the resulting file can still be parsed as CSV.
        frame = pd.DataFrame(rows, columns=["code", *value_columns])
        frame.to_csv(path, index=False, encoding="utf-8")
    return path
|
||
|
||
|
||
def _apply_mapping_to_snapshot(
    result: pd.DataFrame,
    mapping: dict[str, dict[str, object]],
) -> tuple[pd.DataFrame, int, int, int]:
    """Join ``mapping`` onto the snapshot by code and fill its columns.

    Args:
        result: Snapshot that already has ``code``, ``industry``,
            ``concepts`` and all heat columns.
        mapping: Per-code enrichment data.

    Returns:
        ``(frame, filled_industry, filled_concepts, filled_heat)``. The frame
        is a copy unless the mapping yields no usable rows, in which case
        ``result`` itself is returned with zero counts.
    """
    map_df = _mapping_dataframe(mapping)
    if map_df.empty:
        return result, 0, 0, 0

    output = result.copy()
    work = output.copy()
    # Remember the original row order so it can be restored after the merge.
    work["__industry_row"] = range(len(work))
    work["__industry_code"] = work["code"].map(_normalize_code)
    merged = work.merge(map_df, on="__industry_code", how="left", sort=False)
    merged = merged.sort_values("__industry_row", kind="stable")
    # map_df has unique codes, so the left merge keeps one row per input row
    # and the original index can be reattached for label-aligned masks.
    merged.index = output.index

    filled_industry = _apply_industry_column(output, merged)
    filled_concepts = _apply_concepts_column(output, merged)
    filled_heat = 0
    for field in _NUMERIC_FIELDS:
        filled_heat += _apply_numeric_column(output, merged, field)
    for field in _TEXT_FIELDS:
        filled_heat += _apply_text_column(output, merged, field)
    return output, filled_industry, filled_concepts, filled_heat
|
||
|
||
|
||
def _mapping_dataframe(mapping: dict[str, dict[str, object]]) -> pd.DataFrame:
    """Convert ``mapping`` into a merge-ready frame with one row per code.

    Columns are ``__industry_code`` plus ``__map_<field>`` for industry,
    concepts and every heat field; absent fields become ``pd.NA``. Entries
    with an empty or ``"000000"`` normalized code, or a non-dict value, are
    dropped. When several keys normalize to the same code the last one wins.
    """
    fields = ("industry", "concepts", *_HEAT_FIELDS)
    rows: list[dict[str, object]] = []
    for code, item in mapping.items():
        normalized = _normalize_code(code)
        if not normalized or normalized == "000000" or not isinstance(item, dict):
            continue
        row = {"__industry_code": normalized}
        for field in fields:
            row[f"__map_{field}"] = item.get(field, pd.NA)
        rows.append(row)
    if not rows:
        # Keep the expected schema so callers can still test ``.empty``.
        return pd.DataFrame(columns=["__industry_code", *(f"__map_{field}" for field in fields)])
    frame = pd.DataFrame(rows)
    return frame.drop_duplicates(subset=["__industry_code"], keep="last")
|
||
|
||
|
||
def _apply_industry_column(output: pd.DataFrame, merged: pd.DataFrame) -> int:
    """Fill empty ``industry`` cells of ``output`` in place from ``merged``.

    Existing non-empty industries are never overwritten.

    Returns:
        Number of rows that were filled.
    """
    current = output["industry"].map(_safe_text)
    incoming = merged["__map_industry"].map(_safe_text)
    mask = current.eq("") & incoming.ne("")
    if mask.any():
        output.loc[mask, "industry"] = incoming[mask].to_numpy()
    return int(mask.sum())
|
||
|
||
|
||
def _apply_concepts_column(output: pd.DataFrame, merged: pd.DataFrame) -> int:
    """Merge incoming concept labels into ``output["concepts"]`` in place.

    Labels are combined with ``_merge_label_text`` (existing labels first),
    and only rows whose text actually changes are written.

    Returns:
        Number of rows that were updated.
    """
    current = output["concepts"].map(_safe_text)
    incoming = merged["__map_concepts"].map(_safe_text)
    candidate_mask = incoming.ne("")
    if not candidate_mask.any():
        return 0
    merged_values = pd.Series(
        [
            _merge_label_text(left, right) if right else left
            for left, right in zip(current.tolist(), incoming.tolist(), strict=False)
        ],
        index=output.index,
    )
    mask = candidate_mask & merged_values.ne(current)
    if mask.any():
        output.loc[mask, "concepts"] = merged_values[mask].to_numpy()
    return int(mask.sum())
|
||
|
||
|
||
def _apply_numeric_column(output: pd.DataFrame, merged: pd.DataFrame, field: str) -> int:
    """Write incoming numeric values for ``field`` into ``output`` in place.

    A cell is replaced when it is missing or when the incoming value wins
    according to ``_numeric_replacement_mask``.

    Returns:
        Number of rows that were written.
    """
    incoming = merged[f"__map_{field}"].map(_safe_float)
    current = output[field].map(_safe_float)
    mask = _numeric_replacement_mask(field, incoming, current)
    if not mask.any():
        return 0
    values = incoming[mask]
    if field in {"industry_rank", "board_heat_observations"}:
        # Rank and observation count are integers.
        values = values.map(int)
    output.loc[mask, field] = values.to_numpy()
    return int(mask.sum())
|
||
|
||
|
||
def _apply_text_column(output: pd.DataFrame, merged: pd.DataFrame, field: str) -> int:
    """Write incoming text values for ``field`` into ``output`` in place.

    ``board_heat_summary`` is combined with the existing text through
    ``_merge_summary_text``; any other field is only filled when currently
    empty.

    Returns:
        Number of rows whose text changed.
    """
    current = output[field].map(_safe_text)
    incoming = merged[f"__map_{field}"].map(_safe_text)
    candidate_mask = incoming.ne("")
    if not candidate_mask.any():
        return 0
    if field == "board_heat_summary":
        merged_values = pd.Series(
            [
                _merge_summary_text(left, right) if right else left
                for left, right in zip(current.tolist(), incoming.tolist(), strict=False)
            ],
            index=output.index,
        )
    else:
        merged_values = pd.Series(
            [left or right for left, right in zip(current.tolist(), incoming.tolist(), strict=False)],
            index=output.index,
        )
    mask = candidate_mask & merged_values.ne(current)
    if mask.any():
        output.loc[mask, field] = merged_values[mask].to_numpy()
    return int(mask.sum())
|
||
|
||
|
||
def _numeric_replacement_mask(field: str, incoming: pd.Series, current: pd.Series) -> pd.Series:
|
||
candidate_mask = incoming.notna()
|
||
missing_mask = current.isna()
|
||
comparable_mask = candidate_mask & ~missing_mask
|
||
wins = pd.Series(False, index=incoming.index)
|
||
if comparable_mask.any():
|
||
new_values = incoming[comparable_mask].astype(float)
|
||
current_values = current[comparable_mask].astype(float)
|
||
if field == "industry_rank":
|
||
wins.loc[comparable_mask] = new_values < current_values
|
||
elif field == "board_heat_observations":
|
||
wins.loc[comparable_mask] = new_values > current_values
|
||
elif field in {"board_heat_latest_score", "board_heat_persistence_score", "board_heat_cooling_score"}:
|
||
wins.loc[comparable_mask] = new_values > current_values
|
||
elif field == "board_heat_trend_score":
|
||
wins.loc[comparable_mask] = new_values.abs() > current_values.abs()
|
||
elif field.endswith("heat_score"):
|
||
wins.loc[comparable_mask] = new_values > current_values
|
||
return candidate_mask & (missing_mask | wins)
|
||
|
||
|
||
def _resolve_akshare_board_cache_dir(cache_dir: str | Path | None | object) -> Path | None:
    """Translate the ``cache_dir`` argument into a concrete directory.

    The unset sentinel selects the environment-derived default, ``None``
    disables caching, and anything else is wrapped in ``Path``.
    """
    if cache_dir is _CACHE_DIR_UNSET:
        return _default_akshare_board_cache_dir()
    if cache_dir is None:
        return None
    return Path(cache_dir)
|
||
|
||
|
||
def _default_akshare_board_cache_dir() -> Path:
|
||
explicit = (
|
||
os.getenv("ALPHASIFT_INDUSTRY_PROVIDER_CACHE_DIR", "").strip()
|
||
or os.getenv("INDUSTRY_PROVIDER_CACHE_DIR", "").strip()
|
||
)
|
||
if explicit:
|
||
return Path(explicit)
|
||
data_dir = Path(os.getenv("ALPHASIFT_DATA_DIR", str(_PROJECT_ROOT / "data")))
|
||
return data_dir / "industry_provider_cache"
|
||
|
||
|
||
def _resolve_cache_ttl_seconds(
|
||
*,
|
||
cache_ttl_seconds: float | None,
|
||
cache_ttl_hours: float | None,
|
||
) -> float:
|
||
if cache_ttl_seconds is not None:
|
||
return float(cache_ttl_seconds)
|
||
if cache_ttl_hours is not None:
|
||
return float(cache_ttl_hours) * 3600
|
||
raw_hours = (
|
||
os.getenv("ALPHASIFT_INDUSTRY_PROVIDER_CACHE_TTL_HOURS", "").strip()
|
||
or os.getenv("INDUSTRY_PROVIDER_CACHE_TTL_HOURS", "").strip()
|
||
or "24"
|
||
)
|
||
return max(0.0, float(raw_hours)) * 3600
|
||
|
||
|
||
def _akshare_board_cache_path(cache_dir: Path, *, max_boards: int) -> Path:
    """Return the cache file path for the given board limit.

    The schema tag and ``max_boards`` are part of the file name, so caches
    built with different limits or schema versions never collide.
    """
    return cache_dir / f"akshare_board_map_{_AKSHARE_BOARD_CACHE_SCHEMA}_max_boards_{int(max_boards)}.json"
|
||
|
||
|
||
def _read_akshare_board_cache(
    path: Path,
    *,
    max_boards: int,
    ttl_seconds: float,
) -> tuple[dict[str, dict[str, object]] | None, str]:
    """Load the AkShare board cache if it is present, fresh and valid.

    Args:
        path: Cache file to read.
        max_boards: Board limit the cache must have been built with.
        ttl_seconds: Maximum accepted age (by file mtime); a value ``<= 0``
            treats any existing cache as expired.

    Returns:
        ``(mapping, note)``. ``mapping`` is ``None`` whenever the cache
        cannot be used; ``note`` explains why (empty when the file simply
        does not exist). This function never raises for an unreadable or
        malformed cache file.
    """
    try:
        stat = path.stat()
    except FileNotFoundError:
        return None, ""
    except OSError as exc:
        # E.g. permission problems: treat the cache as unusable, not fatal.
        return None, f"industry provider cache skipped: {path} error={exc}"
    if ttl_seconds <= 0:
        return None, f"industry provider cache expired: {path}"
    age_seconds = time.time() - stat.st_mtime
    if age_seconds > ttl_seconds:
        return None, f"industry provider cache expired: {path}"
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except Exception as exc:
        return None, f"industry provider cache skipped: {path} error={exc}"
    if not isinstance(payload, dict):
        return None, f"industry provider cache skipped: {path} invalid payload"
    try:
        cached_max_boards: int | None = int(payload.get("max_boards", 0) or 0)
    except (TypeError, ValueError):
        # A non-numeric value can never match the requested limit.
        cached_max_boards = None
    if (
        payload.get("schema") != _AKSHARE_BOARD_CACHE_SCHEMA
        or payload.get("provider") != "akshare"
        or cached_max_boards != int(max_boards)
    ):
        return None, f"industry provider cache skipped: {path} schema mismatch"
    mapping = _normalize_cached_mapping(payload.get("mapping"))
    if mapping is None:
        return None, f"industry provider cache skipped: {path} invalid mapping"
    return mapping, f"industry provider cache hit: {path} rows={len(mapping)}"
|
||
|
||
|
||
def _write_akshare_board_cache(
    path: Path,
    mapping: dict[str, dict[str, object]],
    *,
    max_boards: int,
) -> str:
    """Write the AkShare board cache and return a status note.

    The payload is written to a uniquely named temporary file in the same
    directory and then renamed over ``path``, so readers never see a
    partially written cache. Failures are reported in the returned note
    instead of being raised, and a leftover temporary file is removed.
    """
    tmp_path: Path | None = None
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "schema": _AKSHARE_BOARD_CACHE_SCHEMA,
            "provider": "akshare",
            "max_boards": int(max_boards),
            "created_at": datetime.now().isoformat(),
            "mapping": _json_safe_mapping(mapping),
        }
        tmp_path = path.with_name(f".{path.name}.{time.time_ns()}.tmp")
        tmp_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        tmp_path.replace(path)
        return f"industry provider cache saved: {path} rows={len(mapping)}"
    except Exception as exc:
        # Caching is best effort: report the problem, never fail the caller.
        if tmp_path is not None:
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                # Cleanup is also best effort; the original error is reported.
                pass
        return f"industry provider cache skipped: {path} error={exc}"
|
||
|
||
|
||
def _normalize_cached_mapping(value: object) -> dict[str, dict[str, object]] | None:
    """Validate and normalize the ``mapping`` section of a cache payload.

    Returns:
        ``None`` when ``value`` is not a dict. Otherwise a mapping keyed by
        normalized code; entries with an empty or ``"000000"`` code, or a
        non-dict value, are dropped. Numeric and text heat fields are kept
        only when they parse to a value / non-empty text.
    """
    if not isinstance(value, dict):
        return None
    mapping: dict[str, dict[str, object]] = {}
    for code, raw_item in value.items():
        normalized = _normalize_code(code)
        if not normalized or normalized == "000000" or not isinstance(raw_item, dict):
            continue
        item: dict[str, object] = {
            "industry": _safe_text(raw_item.get("industry")),
            "concepts": _safe_text(raw_item.get("concepts")),
        }
        for field in _NUMERIC_FIELDS:
            parsed = _safe_float(raw_item.get(field))
            if parsed is not None:
                item[field] = int(parsed) if field in {"industry_rank", "board_heat_observations"} else parsed
        for field in _TEXT_FIELDS:
            text = _safe_text(raw_item.get(field))
            if text:
                item[field] = text
        mapping[normalized] = item
    return mapping
|
||
|
||
|
||
def _json_safe_mapping(mapping: dict[str, dict[str, object]]) -> dict[str, dict[str, object]]:
    """Return a copy of ``mapping`` cleaned by ``_json_safe_item``.

    Entries are emitted in sorted code order; non-dict values are dropped.
    """
    return {
        code: _json_safe_item(item)
        for code, item in sorted(mapping.items())
        if isinstance(item, dict)
    }
|
||
|
||
|
||
def _json_safe_item(item: dict[str, object]) -> dict[str, object]:
    """Reduce one mapping entry to its known fields for the cache payload.

    ``industry`` and ``concepts`` are always present as text. Numeric heat
    fields are included only when they parse to a number (rank and
    observation count as ``int``); text heat fields only when non-empty.
    Unknown keys are discarded.
    """
    cleaned: dict[str, object] = {
        "industry": _safe_text(item.get("industry")),
        "concepts": _safe_text(item.get("concepts")),
    }
    for field in _NUMERIC_FIELDS:
        value = _safe_float(item.get(field))
        if value is not None:
            cleaned[field] = int(value) if field in {"industry_rank", "board_heat_observations"} else value
    for field in _TEXT_FIELDS:
        text = _safe_text(item.get(field))
        if text:
            cleaned[field] = text
    return cleaned
|
||
|
||
|
||
def _board_names(df: pd.DataFrame) -> list[str]:
    """Return the non-empty board names from the first known name column.

    Columns are tried in the order ``板块名称``, ``名称``, ``name``; an empty
    list is returned when none of them exists.
    """
    for column in ("板块名称", "名称", "name"):
        if column in df.columns:
            names = (_safe_text(item) for item in df[column].tolist())
            return [name for name in names if name]
    return []
|
||
|
||
|
||
def _board_items(df: pd.DataFrame) -> list[dict[str, object]]:
    """Extract ``name`` / ``rank`` / ``change_pct`` records from a board list.

    Rows without a board name are skipped. When a row has no usable rank
    column, its 1-based position in ``df`` is used as the rank, which also
    works for frames whose index is not a plain integer range.
    """
    items: list[dict[str, object]] = []
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        name = _first_row_value(row, ["板块名称", "名称", "name"])
        if not _safe_text(name):
            continue
        rank = _safe_float(_first_row_value(row, ["排名", "序号", "rank"]))
        if rank is None:
            rank = float(position)
        change_pct = _safe_float(_first_row_value(row, ["涨跌幅", "涨幅", "change_pct"]))
        items.append({
            "name": _safe_text(name),
            "rank": rank,
            "change_pct": change_pct,
        })
    return items
|
||
|
||
|
||
def _merge_mapping(target: dict[str, dict[str, object]], source: dict[str, dict[str, object]]) -> None:
    """Merge ``source`` into ``target`` in place, code by code.

    Rules per field:
      * ``industry`` and ``board_heat_state``: only filled when the target
        has no value yet.
      * ``concepts``: labels are merged with ``_merge_label_text``.
      * ``board_heat_summary``: merged with ``_merge_summary_text``.
      * numeric fields: set when missing, otherwise replaced only when
        ``_should_replace_numeric`` says the new value wins.
    """
    for code, item in source.items():
        existing = target.setdefault(code, {"industry": "", "concepts": ""})
        if item.get("industry") and not existing.get("industry"):
            existing["industry"] = item["industry"]
        if item.get("concepts"):
            existing["concepts"] = _merge_label_text(existing.get("concepts", ""), item["concepts"])
        if item.get("board_heat_summary"):
            existing["board_heat_summary"] = _merge_summary_text(
                _safe_text(existing.get("board_heat_summary")),
                item.get("board_heat_summary", ""),
            )
        if item.get("board_heat_state") and not existing.get("board_heat_state"):
            existing["board_heat_state"] = item["board_heat_state"]
        for field in _NUMERIC_FIELDS:
            value = _safe_float(item.get(field))
            if value is None:
                continue
            current = _safe_float(existing.get(field))
            if current is None or _should_replace_numeric(field, value, current):
                existing[field] = int(value) if field in {"industry_rank", "board_heat_observations"} else value
|
||
|
||
|
||
def _load_companion_board_heat_trends(path_like: str | Path) -> tuple[dict[str, dict[str, object]], str]:
    """Load heat trends from the history file that accompanies a map file.

    The companion is the map path with ``.history.jsonl`` appended to its
    suffix (``map.csv`` -> ``map.csv.history.jsonl``).

    Returns:
        ``(trends, note)``. Both are empty when no companion file exists; on
        a load error ``trends`` is empty and ``note`` describes the failure.
    """
    path = Path(path_like)
    history_path = path.with_suffix(path.suffix + ".history.jsonl")
    if not history_path.is_file():
        return {}, ""
    try:
        trends = load_board_heat_trends(history_path)
    except Exception as exc:
        # Trends are optional; a broken history file must not block loading.
        return {}, f"board heat trends skipped: {history_path} error={exc}"
    return trends, f"board heat trends loaded: {history_path} boards={len(trends)}"
|
||
|
||
|
||
def load_board_heat_trends(
    path_like: str | Path,
    *,
    window_size: int = 5,
    hot_score: float = 60.0,
    cooling_threshold: float = 5.0,
) -> dict[str, dict[str, object]]:
    """Load board heat trend stats from an industry-cache history JSONL file.

    Each usable line is a JSON object with ``board``,
    ``max_board_heat_score`` (kept only when within 0..100) and
    ``generated_at``. Blank lines, invalid JSON and non-object lines are
    skipped. Per board, the most recent ``window_size`` observations (ordered
    by the ``generated_at`` text) are summarized.

    Args:
        path_like: History JSONL file.
        window_size: Number of latest observations to use (minimum 1).
        hot_score: Heat at or above which an observation counts as hot.
        cooling_threshold: Threshold forwarded to ``_board_heat_state``.

    Returns:
        Mapping of board name to latest score, trend (last minus first),
        persistence (percentage of hot observations), cooling (drop from the
        previous observation, never negative), observation count and state.

    Raises:
        FileNotFoundError: ``path_like`` is not an existing file.
    """
    path = Path(path_like)
    if not path.is_file():
        raise FileNotFoundError(f"Board heat history file not found: {path}")
    grouped: dict[str, list[tuple[str, float]]] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            continue
        if not isinstance(item, dict):
            continue
        board = _safe_text(item.get("board"))
        heat = _safe_float(item.get("max_board_heat_score"))
        if not board or heat is None:
            continue
        if heat < 0 or heat > 100:
            continue
        grouped.setdefault(board, []).append((_safe_text(item.get("generated_at")), heat))

    window = max(int(window_size), 1)
    trends: dict[str, dict[str, object]] = {}
    for board, rows in grouped.items():
        # Stable sort: observations with equal timestamps keep file order.
        ordered = sorted(rows, key=lambda row: str(row[0]))
        # Every group holds at least one validated float, so this is non-empty.
        heat_values = [heat for _, heat in ordered[-window:]]
        first = heat_values[0]
        last = heat_values[-1]
        previous = heat_values[-2] if len(heat_values) >= 2 else last
        trend_score = last - first
        cooling_score = max(previous - last, 0.0)
        persistence_score = sum(1 for heat in heat_values if heat >= hot_score) / len(heat_values) * 100
        trends[board] = {
            "board_heat_latest_score": round(last, 4),
            "board_heat_trend_score": round(trend_score, 4),
            "board_heat_persistence_score": round(persistence_score, 4),
            "board_heat_cooling_score": round(cooling_score, 4),
            "board_heat_observations": len(heat_values),
            "board_heat_state": _board_heat_state(
                trend_score=trend_score,
                cooling_score=cooling_score,
                persistence_score=persistence_score,
                hot_score=hot_score,
                cooling_threshold=cooling_threshold,
            ),
        }
    return trends
|
||
|
||
|
||
def _apply_board_heat_trends(
    mapping: dict[str, dict[str, object]],
    trends: dict[str, dict[str, object]],
) -> None:
    """Copy trend statistics onto mapping entries in place.

    For each entry, the boards named in its ``board_heat_summary`` are looked
    up in ``trends``. Among the matches, the one with the most observations
    is used, ties broken by latest score and then by absolute trend score.
    Entries without a matching board are left untouched.
    """
    for item in mapping.values():
        boards = _summary_boards(item.get("board_heat_summary", ""))
        matches = [trends[board] for board in boards if board in trends]
        if not matches:
            continue
        best = max(
            matches,
            key=lambda trend: (
                int(trend.get("board_heat_observations", 0) or 0),
                _safe_float(trend.get("board_heat_latest_score")) or 0.0,
                abs(_safe_float(trend.get("board_heat_trend_score")) or 0.0),
            ),
        )
        for field in (
            "board_heat_latest_score",
            "board_heat_trend_score",
            "board_heat_persistence_score",
            "board_heat_cooling_score",
            "board_heat_observations",
            "board_heat_state",
        ):
            if field in best:
                item[field] = best.get(field)
|
||
|
||
|
||
def _summary_boards(value: object) -> list[str]:
    """Extract board names from a heat summary string.

    The summary is normalized through ``_merge_summary_text`` (which also
    de-duplicates and caps the number of entries) and split on ``|``; the
    board name is the text before the first ``:`` of each entry.
    """
    boards = []
    for summary in _merge_summary_text("", value).split("|"):
        board = summary.strip().split(":", 1)[0].strip()
        if board:
            boards.append(board)
    return boards
|
||
|
||
|
||
def _merge_label_text(left: str, right: str) -> str:
|
||
labels: list[str] = []
|
||
seen = set()
|
||
for raw in (left, right):
|
||
for item in str(raw or "").replace(",", ",").replace("、", ",").split(","):
|
||
label = item.strip()
|
||
if label and label.lower() not in {"nan", "none", "<na>"} and label not in seen:
|
||
seen.add(label)
|
||
labels.append(label)
|
||
return ",".join(labels)
|
||
|
||
|
||
def _merge_summary_text(left: object, right: object, *, limit: int = 8) -> str:
|
||
labels: list[str] = []
|
||
seen = set()
|
||
for raw in (left, right):
|
||
for item in str(raw or "").replace("\n", " | ").split("|"):
|
||
label = item.strip()
|
||
if label and label.lower() not in {"nan", "none", "<na>"} and label not in seen:
|
||
seen.add(label)
|
||
labels.append(label)
|
||
return " | ".join(labels[:limit])
|
||
|
||
|
||
def _first_row_value(row: dict | pd.Series, columns: list[str]) -> object:
|
||
for column in columns:
|
||
if column in row:
|
||
return row.get(column)
|
||
return None
|
||
|
||
|
||
def _max_numeric(left: object, right: object) -> float | None:
    """Return the larger of two values after ``_safe_float`` conversion.

    A value that does not convert is ignored; ``None`` is returned only when
    neither converts.
    """
    left_num = _safe_float(left)
    right_num = _safe_float(right)
    if left_num is None:
        return right_num
    if right_num is None:
        return left_num
    return max(left_num, right_num)
|
||
|
||
|
||
def _should_replace_numeric(field: str, new_value: float, current_value: float) -> bool:
|
||
if field == "industry_rank":
|
||
return new_value < current_value
|
||
if field == "board_heat_observations":
|
||
return new_value > current_value
|
||
if field in {"board_heat_latest_score", "board_heat_persistence_score", "board_heat_cooling_score"}:
|
||
return new_value > current_value
|
||
if field == "board_heat_trend_score":
|
||
return abs(new_value) > abs(current_value)
|
||
if field.endswith("heat_score"):
|
||
return new_value > current_value
|
||
return False
|
||
|
||
|
||
def _board_heat_state(
|
||
*,
|
||
trend_score: float,
|
||
cooling_score: float,
|
||
persistence_score: float,
|
||
hot_score: float,
|
||
cooling_threshold: float,
|
||
) -> str:
|
||
if cooling_score >= cooling_threshold:
|
||
return "cooling"
|
||
if trend_score >= cooling_threshold:
|
||
return "warming"
|
||
if persistence_score >= 66.6667 and hot_score > 0:
|
||
return "persistent_hot"
|
||
if trend_score <= -cooling_threshold:
|
||
return "weakening"
|
||
return "flat"
|
||
|
||
|
||
def _board_heat_score(*, change_pct: float | None, rank: float | None) -> float:
|
||
score = 50.0
|
||
if change_pct is not None:
|
||
score += change_pct * 6.0
|
||
if rank is not None and rank > 0:
|
||
score += max(0.0, 12.0 - min(rank, 12.0))
|
||
return round(max(0.0, min(score, 100.0)), 4)
|
||
|
||
|
||
def _board_heat_summary(board: str, *, change_pct: float | None, rank: float | None) -> str:
|
||
parts = [board]
|
||
if change_pct is not None:
|
||
parts.append(f"{change_pct:+.2f}%")
|
||
if rank is not None:
|
||
parts.append(f"rank={int(rank)}")
|
||
return ":".join(parts)
|