# -*- coding: utf-8 -*- """Context assembly for LLM ranking.""" from __future__ import annotations from dataclasses import dataclass import json from pathlib import Path import pandas as pd from alphasift.normalize import normalize_code as _normalize_code _CONTEXT_TRIM_MARKER = "[context_trimmed]" _CANDIDATE_CONTEXT_COLUMNS = { "news": "新闻", "announcement": "公告", "announcements": "公告", "fund_flow": "资金流", "fundflow": "资金流", "quote": "行情估值", "summary": "摘要", "context": "上下文", "context_summary": "压缩摘要", "text": "文本", "risk": "风险", "catalyst": "催化", "source_count": "来源数", "source_confidence": "来源置信度", "source_weight_score": "来源权重分", "event_tags": "事件标签", "announcement_categories": "公告类别", "negative_event_flags": "负面风险", } @dataclass(frozen=True) class _ContextSection: text: str kind: str priority: int min_chars: int weight: int line_aware: bool = False def build_llm_context( *, base_context: str = "", context_files: list[str | Path] | None = None, candidate_context_files: list[str | Path] | None = None, candidate_context_rows: list[dict[str, object]] | None = None, snapshot_df: pd.DataFrame | None = None, candidate_df: pd.DataFrame | None = None, event_profile: dict[str, object] | None = None, max_chars: int = 4000, degradation: list[str] | None = None, ) -> str: """Build bounded context text for the LLM soft ranker.""" sections: list[_ContextSection] = [] if base_context.strip(): sections.append(_section( "【人工上下文】\n" + base_context.strip(), kind="market_context", priority=5, min_chars=80, weight=1, line_aware=True, )) file_context = _read_context_files(context_files or []) if file_context: sections.append(_section( "【上下文文件】\n" + file_context, kind="market_files", priority=5, min_chars=80, weight=1, line_aware=True, )) event_profile_context = summarize_event_profile(event_profile) if event_profile_context: sections.append(_section( event_profile_context, kind="event_profile", priority=2, min_chars=160, weight=2, line_aware=True, )) candidate_identity = summarize_candidate_identity(candidate_df) if candidate_identity: sections.append(_section( candidate_identity, kind="candidate_identity", priority=0, min_chars=360, weight=4, line_aware=True, )) candidate_external_context = _read_candidate_context_files( candidate_context_files or [], candidate_df, ) if candidate_external_context: sections.append(_section( "【候选外部线索】\n" + candidate_external_context, kind="candidate_external_context", priority=1, min_chars=320, weight=4, line_aware=True, )) collected_candidate_context = _format_candidate_context_rows( candidate_context_rows or [], candidate_df, ) if collected_candidate_context: sections.append(_section( "【候选抓取线索】\n" + collected_candidate_context, kind="candidate_collected_context", priority=1, min_chars=360, weight=4, line_aware=True, )) snapshot_context = summarize_snapshot_context(snapshot_df, title="全市场快照") if snapshot_context: sections.append(_section( snapshot_context, kind="market_snapshot", priority=4, min_chars=160, weight=2, line_aware=True, )) candidate_context = summarize_snapshot_context(candidate_df, title="候选池快照") if candidate_context: sections.append(_section( candidate_context, kind="candidate_snapshot", priority=2, min_chars=180, weight=2, line_aware=True, )) candidate_profile = summarize_candidate_profile(candidate_df) if candidate_profile: sections.append(_section( candidate_profile, kind="candidate_profile", priority=2, min_chars=240, weight=3, line_aware=True, )) return _join_bounded_context_sections(sections, max_chars=max_chars, degradation=degradation) def summarize_snapshot_context(df: pd.DataFrame | None, *, title: str) -> str: """Summarize breadth, activity and extremes from a snapshot DataFrame.""" if df is None or df.empty: return "" lines = [f"【{title}】", f"样本数: {len(df)}"] if "change_pct" in df.columns: change = pd.to_numeric(df["change_pct"], errors="coerce").dropna() if not change.empty: positive_ratio = (change > 0).mean() * 100 lines.append( "涨跌分布: " f"上涨占比 {positive_ratio:.1f}%, " f"中位涨跌幅 {change.median():.2f}%, " f"平均涨跌幅 {change.mean():.2f}%" ) lines.append("涨跌极值: " + _format_extremes(df, change, ascending=False)) if "amount" in df.columns: amount = pd.to_numeric(df["amount"], errors="coerce").dropna() if not amount.empty: lines.append(f"成交额中位数: {amount.median():.0f}") if "volume_ratio" in df.columns: volume_ratio = pd.to_numeric(df["volume_ratio"], errors="coerce").dropna() if not volume_ratio.empty: hot_ratio = (volume_ratio >= 2).mean() * 100 lines.append(f"量比>=2占比: {hot_ratio:.1f}%") return "\n".join(lines) def summarize_candidate_profile(df: pd.DataFrame | None) -> str: """Summarize factor conflicts and leadership inside the candidate pool.""" if df is None or df.empty: return "" lines = ["【候选池结构】"] factor_cols = { "价值": "factor_value_score", "流动性": "factor_liquidity_score", "动量": "factor_momentum_score", "反转": "factor_reversal_score", "活跃度": "factor_activity_score", "稳定性": "factor_stability_score", "市值容量": "factor_size_score", "主题热度": "factor_theme_heat_score", } available = {label: col for label, col in factor_cols.items() if col in df.columns} if available: averages = [] for label, col in available.items(): series = pd.to_numeric(df[col], errors="coerce").dropna() if not series.empty: averages.append(f"{label}{series.mean():.1f}") if averages: lines.append("因子均值: " + ",".join(averages)) leaders = [] columns = [col for col in ("code", "name") if col in df.columns] for label, col in available.items(): series = pd.to_numeric(df[col], errors="coerce") if series.dropna().empty or not columns: continue idx = series.idxmax() row = df.loc[idx] leaders.append(f"{label}:{row.get('code', '')}{row.get('name', '')}({series.loc[idx]:.1f})") if leaders: lines.append("因子领先: " + ",".join(leaders[:8])) if "screen_score" in df.columns: screen_score = pd.to_numeric(df["screen_score"], errors="coerce").dropna() if not screen_score.empty: lines.append( f"主评分分布: 最高{screen_score.max():.1f}," f"中位{screen_score.median():.1f},最低{screen_score.min():.1f}" ) industry_summary = _summarize_label_distribution(df, "industry") if industry_summary: lines.append("行业分布: " + industry_summary) concept_summary = _summarize_label_distribution(df, "concepts") if concept_summary: lines.append("概念线索: " + concept_summary) heat_summary = _summarize_board_heat(df) if heat_summary: lines.append("板块/主题热度: " + heat_summary) return "\n".join(lines) if len(lines) > 1 else "" def summarize_candidate_identity(df: pd.DataFrame | None, *, limit: int = 30) -> str: """Summarize top candidate identities and key ranking fields.""" if df is None or df.empty or "code" not in df.columns: return "" lines = ["【候选身份】"] for _, row in df.head(max(int(limit), 1)).iterrows(): code = _normalize_code(row.get("code", row.get("代码", ""))) if not code: continue name = _safe_context_value( row.get("name") or row.get("名称") or row.get("股票名称"), max_len=40, ) fields = [] score = _safe_context_value(row.get("screen_score"), max_len=24) if score: fields.append(f"screen_score={score}") industry = _safe_context_value(row.get("industry") or row.get("行业"), max_len=40) if industry: fields.append(f"industry={industry}") concepts = _safe_context_value(row.get("concepts") or row.get("概念"), max_len=80) if concepts: fields.append(f"concepts={concepts}") heat = _safe_context_value(row.get("board_heat_score"), max_len=24) if heat: fields.append(f"board_heat_score={heat}") suffix = ": " + ", ".join(fields) if fields else "" lines.append(f"- {code} {name}{suffix}") if len(df) > limit: lines.append(f"...[candidate_identity_omitted:{len(df) - limit}]") return "\n".join(lines) if len(lines) > 1 else "" def summarize_event_profile(event_profile: dict[str, object] | None) -> str: """Summarize strategy-level event preferences for the LLM.""" if not event_profile: return "" lines = ["【策略事件偏好】"] field_labels = { "preferred_event_tags": "偏好事件标签", "avoided_event_tags": "规避事件标签", "preferred_announcement_categories": "偏好公告类别", "avoided_announcement_categories": "规避公告类别", "notes": "事件备注", } for field, label in field_labels.items(): value = _format_profile_value(event_profile.get(field)) if value: lines.append(f"{label}: {value}") source_weights = event_profile.get("source_weights") if isinstance(source_weights, dict) and source_weights: items = [] for source, weight in source_weights.items(): text = _safe_context_value(source, max_len=40) if not text: continue try: items.append(f"{text}={float(weight):.2f}") except (TypeError, ValueError): continue if items: lines.append("来源权重: " + ",".join(items)) return "\n".join(lines) if len(lines) > 1 else "" def _read_context_files(paths: list[str | Path]) -> str: chunks: list[str] = [] for path_like in paths: path = Path(path_like) if not path.is_file(): raise FileNotFoundError(f"Context file not found: {path}") text = path.read_text(encoding="utf-8").strip() if text: chunks.append(f"# {path.name}\n{text}") return "\n\n".join(chunks) def _read_candidate_context_files( paths: list[str | Path], candidate_df: pd.DataFrame | None, ) -> str: if not paths or candidate_df is None or candidate_df.empty or "code" not in candidate_df.columns: return "" candidate_names, candidate_order = _candidate_maps(candidate_df) candidate_codes = set(candidate_names) chunks: list[tuple[int, int, str]] = [] row_position = 0 for path_like in paths: path = Path(path_like) if not path.is_file(): raise FileNotFoundError(f"Candidate context file not found: {path}") rows = _load_candidate_context_rows(path) for row in rows: code = _normalize_code(row.get("code", row.get("代码", ""))) item = _format_candidate_context_row(row, candidate_codes, candidate_names) if item: chunks.append((candidate_order.get(code, len(candidate_order)), row_position, item)) row_position += 1 return "\n".join(item for _, _, item in sorted(chunks)) def _format_candidate_context_rows( rows: list[dict[str, object]], candidate_df: pd.DataFrame | None, ) -> str: if not rows or candidate_df is None or candidate_df.empty or "code" not in candidate_df.columns: return "" candidate_names, candidate_order = _candidate_maps(candidate_df) candidate_codes = set(candidate_names) chunks = [] for idx, row in enumerate(rows): code = _normalize_code(row.get("code", row.get("代码", ""))) item = _format_candidate_context_row(row, candidate_codes, candidate_names) if item: chunks.append((candidate_order.get(code, len(candidate_order)), idx, item)) return "\n".join(item for _, _, item in sorted(chunks)) def _candidate_maps(candidate_df: pd.DataFrame) -> tuple[dict[str, str], dict[str, int]]: candidate_names: dict[str, str] = {} candidate_order: dict[str, int] = {} for idx, (_, row) in enumerate(candidate_df.iterrows()): code = _normalize_code(row.get("code", row.get("代码", ""))) if not code: continue candidate_names[code] = str(row.get("name", row.get("名称", "")) or "") candidate_order.setdefault(code, idx) return candidate_names, candidate_order def _format_candidate_context_row( row: dict[str, object], candidate_codes: set[str], candidate_names: dict[str, str], ) -> str: code = _normalize_code(row.get("code", row.get("代码", ""))) if code not in candidate_codes: return "" fields = [] for column, label in _CANDIDATE_CONTEXT_COLUMNS.items(): value = _safe_context_value(row.get(column)) if value: fields.append(f"{label}:{value}") if not fields: return "" name = _safe_context_value(row.get("name") or row.get("名称")) or candidate_names.get(code, "") return f"- {code} {name}: " + ";".join(fields) def _load_candidate_context_rows(path: Path) -> list[dict[str, object]]: suffix = path.suffix.lower() if suffix == ".csv": return pd.read_csv(path, dtype=str).fillna("").to_dict(orient="records") if suffix == ".jsonl": rows = [] for line in path.read_text(encoding="utf-8").splitlines(): line = line.strip() if line: item = json.loads(line) if isinstance(item, dict): rows.append(item) return rows if suffix == ".json": data = json.loads(path.read_text(encoding="utf-8")) if isinstance(data, list): return [item for item in data if isinstance(item, dict)] if isinstance(data, dict): items = data.get("items") or data.get("data") if isinstance(items, list): return [item for item in items if isinstance(item, dict)] rows = [] for code, value in data.items(): if isinstance(value, dict): rows.append({"code": code, **value}) elif isinstance(value, str): rows.append({"code": code, "text": value}) return rows raise ValueError(f"Unsupported candidate context file format: {path}") def _safe_context_value(value: object, *, max_len: int = 280) -> str: if value is None: return "" if isinstance(value, list): text = ",".join(str(item).strip() for item in value if str(item).strip()) else: text = str(value).strip() if not text or text.lower() in {"nan", "none", ""}: return "" return text[:max_len] def _format_profile_value(value: object) -> str: if isinstance(value, list): return ",".join( item for item in (_safe_context_value(raw, max_len=80) for raw in value) if item ) return _safe_context_value(value, max_len=280) def _section( text: str, *, kind: str, priority: int, min_chars: int, weight: int, line_aware: bool = False, ) -> _ContextSection: return _ContextSection( text=text.strip(), kind=kind, priority=priority, min_chars=max(int(min_chars), 0), weight=max(int(weight), 1), line_aware=line_aware, ) def _join_bounded_context_sections( sections: list[_ContextSection], *, max_chars: int, degradation: list[str] | None, ) -> str: sections = [section for section in sections if section.text.strip()] combined = "\n\n".join(section.text for section in sections).strip() if not combined: return "" if len(combined) <= max_chars: return combined marker_line = f"【上下文降级】{_CONTEXT_TRIM_MARKER} low-priority context trimmed." budget = max(int(max_chars) - len(marker_line) - 2, 0) if budget <= 0: return marker_line[:max_chars] allocations = _allocate_section_budgets(sections, budget) chunks: list[str] = [] trimmed_kinds: list[str] = [] for section, limit in zip(sections, allocations, strict=False): trimmed = _trim_section(section, limit) if trimmed: chunks.append(trimmed) if len(trimmed) < len(section.text): trimmed_kinds.append(section.kind) result = "\n\n".join(chunks).strip() if trimmed_kinds: trimmed_labels = ",".join(dict.fromkeys(trimmed_kinds)) marker_line = ( f"【上下文降级】{_CONTEXT_TRIM_MARKER} " f"trimmed={trimmed_labels}" ) result = _append_marker_within_limit(result, marker_line, max_chars=max_chars) if degradation is not None: degradation.append(f"LLM context truncated: trimmed={trimmed_labels}") return result[:max_chars] def _allocate_section_budgets(sections: list[_ContextSection], budget: int) -> list[int]: separator_budget = max(len(sections) - 1, 0) * 2 body_budget = max(budget - separator_budget, 0) minimums = [min(len(section.text), section.min_chars) for section in sections] minimum_total = sum(minimums) if minimum_total > body_budget: return _priority_floor_allocations(sections, body_budget) allocations = list(minimums) remaining = body_budget - minimum_total while remaining > 0: expandable = [ idx for idx, section in enumerate(sections) if allocations[idx] < len(section.text) ] if not expandable: break total_weight = sum(sections[idx].weight for idx in expandable) progressed = False for idx in expandable: section = sections[idx] extra = len(section.text) - allocations[idx] share = max(1, int(remaining * section.weight / max(total_weight, 1))) take = min(extra, share, remaining) if take <= 0: continue allocations[idx] += take remaining -= take progressed = True if remaining <= 0: break if not progressed: break return allocations def _priority_floor_allocations(sections: list[_ContextSection], budget: int) -> list[int]: allocations = [0] * len(sections) remaining = max(int(budget), 0) for idx in sorted(range(len(sections)), key=lambda item: sections[item].priority): if remaining <= 0: break section = sections[idx] floor = min(len(section.text), max(section.min_chars, 80)) take = min(floor, remaining) allocations[idx] = take remaining -= take return allocations def _trim_section(section: _ContextSection, limit: int) -> str: if limit <= 0: return "" text = section.text if len(text) <= limit: return text marker = f"\n...{_CONTEXT_TRIM_MARKER}:{section.kind}" if limit <= len(marker) + 8: return text[:limit].rstrip() content_limit = limit - len(marker) if not section.line_aware: return text[:content_limit].rstrip() + marker kept: list[str] = [] for line in text.splitlines(): candidate = "\n".join([*kept, line]).rstrip() + marker if len(candidate) > limit: prefix = "\n".join(kept).rstrip() separator = "\n" if prefix else "" remaining = limit - len(prefix) - len(separator) - len(marker) if remaining > 8: kept.append(line[:remaining].rstrip()) break kept.append(line) if not kept: return text[:content_limit].rstrip() + marker return "\n".join(kept).rstrip() + marker def _append_marker_within_limit(text: str, marker: str, *, max_chars: int) -> str: if not text: return marker[:max_chars] candidate = f"{text}\n{marker}" if len(candidate) <= max_chars: return candidate keep = max(max_chars - len(marker) - 1, 0) if keep <= 0: return marker[:max_chars] return text[:keep].rstrip() + "\n" + marker def _format_extremes(df: pd.DataFrame, change: pd.Series, *, ascending: bool) -> str: columns = [col for col in ("code", "name", "change_pct") if col in df.columns] if not columns: return "" top = df.loc[change.sort_values(ascending=ascending).head(3).index, columns] items = [] for _, row in top.iterrows(): code = str(row.get("code", "")) name = str(row.get("name", "")) pct = row.get("change_pct", 0) items.append(f"{code}{name}({float(pct):.2f}%)") return ", ".join(items) def _summarize_label_distribution(df: pd.DataFrame, column: str) -> str: if column not in df.columns: return "" labels: list[str] = [] for raw in df[column].dropna().astype(str): for item in raw.replace(",", ",").replace("、", ",").split(","): label = item.strip() if label and label.lower() not in {"nan", "none", ""}: labels.append(label) if not labels: return "" counts = pd.Series(labels).value_counts().head(6) return ",".join(f"{label}{count}" for label, count in counts.items()) def _summarize_board_heat(df: pd.DataFrame, *, limit: int = 5) -> str: if "board_heat_score" not in df.columns: return "" values = pd.to_numeric(df["board_heat_score"], errors="coerce") if values.dropna().empty: return "" items = [] for idx in values.sort_values(ascending=False).dropna().head(limit).index: row = df.loc[idx] code = str(row.get("code", "") or "") name = str(row.get("name", "") or "") label = str(row.get("board_heat_summary", "") or row.get("industry", "") or "") trend = _safe_context_value(row.get("board_heat_trend_score"), max_len=20) persistence = _safe_context_value(row.get("board_heat_persistence_score"), max_len=20) cooling = _safe_context_value(row.get("board_heat_cooling_score"), max_len=20) state = _safe_context_value(row.get("board_heat_state"), max_len=20) trend_text = f",trend={trend}" if trend else "" persistence_text = f",persist={persistence}" if persistence else "" cooling_text = f",cooling={cooling}" if cooling else "" state_text = f",state={state}" if state else "" items.append( f"{code}{name}:{float(values.loc[idx]):.1f}" f"{trend_text}{persistence_text}{cooling_text}{state_text}({label[:60]})" ) return ",".join(items)