# -*- coding: utf-8 -*-
"""Main pipeline — orchestrates L1 → L2 → result."""

import logging
import uuid
from pathlib import Path

import pandas as pd

from alphasift.config import Config
from alphasift.candidate_context import collect_candidate_context
from alphasift.context import build_llm_context
from alphasift.daily import enrich_daily_features
from alphasift.dsa_provider import apply_dsa_provider_context
from alphasift.filter import (
    apply_hard_filters,
    hard_filter_rejection_summary,
    hard_filter_waterfall,
    requires_daily_features,
    without_daily_filters,
)
from alphasift.industry import enrich_industry_concepts
from alphasift.models import Pick, ScreenResult
from alphasift.normalize import (
    normalize_code,
    safe_bool as _safe_bool,
    safe_float as _safe_float,
    safe_int as _safe_int,
    safe_text,
)
from alphasift.post_analysis import normalize_post_analyzers, run_post_analyzers
from alphasift.ranker import rank_candidates_with_metadata
from alphasift.risk import apply_portfolio_overlay, apply_risk_overlay
from alphasift.scorer import compute_screen_scores, factor_score_columns
from alphasift.snapshot import fetch_snapshot_with_fallback
from alphasift.strategy import load_all_strategies

logger = logging.getLogger(__name__)


def screen(
    strategy: str,
    *,
    market: str = "cn",
    max_output: int | None = None,
    use_llm: bool = True,
    llm_context: str | None = None,
    llm_context_files: list[str | Path] | None = None,
    candidate_context_files: list[str | Path] | None = None,
    collect_llm_candidate_context: bool | None = None,
    candidate_context_max_candidates: int | None = None,
    candidate_context_providers: list[str] | None = None,
    industry_map_files: list[str | Path] | None = None,
    industry_provider: str | None = None,
    post_analyzers: list[str] | None = None,
    post_analysis_max_picks: int | None = None,
    daily_enrich: bool | None = None,
    daily_enrich_max_candidates: int | None = None,
    explain_filters: bool = False,
    deep_analysis: bool = False,
    deep_analysis_max_picks: int | None = None,
    context: dict[str, object] | None = None,
    config: Config | None = None,
) -> ScreenResult:
    """Execute stock screening with the given strategy.

    Args:
        strategy: Strategy name (matches a YAML file in strategies/).
        market: Market scope, "cn" or "us". The selected strategy must also
            list it in its ``market_scope``.
        max_output: Override max output count from strategy.
        use_llm: Whether to use LLM for L2 ranking.
        llm_context: Optional market/news/theme context supplied to the LLM ranker.
        llm_context_files: Optional text files appended to LLM context.
        candidate_context_files: Optional CSV/JSON/JSONL files keyed by code with
            candidate-level context.
        collect_llm_candidate_context: Whether to fetch Top-K candidate
            news/fund-flow context for LLM.
        candidate_context_max_candidates: Max candidates to fetch external context for.
        candidate_context_providers: Optional provider names: news, fund_flow,
            announcement.
        industry_map_files: Optional code->industry/concepts files used before L1/L2.
        industry_provider: Optional provider for board mapping, e.g. "akshare".
        post_analyzers: Optional L3 analyzers, e.g. ["scorecard", "dsa"].
        post_analysis_max_picks: Override max number of picks sent to post analyzers.
        daily_enrich: Whether to enrich shortlisted candidates with daily K-line
            features.
        daily_enrich_max_candidates: Max candidates to enrich after snapshot filtering.
        explain_filters: Whether to include sequential hard-filter waterfall
            diagnostics.
        deep_analysis: Backward-compatible alias for post_analyzers=["dsa"].
        deep_analysis_max_picks: Backward-compatible max-picks alias for DSA.
        context: Optional host runtime context. DSA may provide LLM settings and
            callable data providers under context["dsa"].
        config: Runtime config. Defaults to Config.from_env().

    Returns:
        ScreenResult with ranked picks.

    Raises:
        ValueError: If ``market`` is unsupported, ``strategy`` is unknown, or the
            strategy does not support ``market``.
        RuntimeError: If the strategy's hard filters require daily features and
            the daily K-line enrichment step fails.
    """
    if config is None:
        config = Config.from_env()

    if market not in ("cn", "us"):
        raise ValueError(f"Unsupported market: {market!r} (supported: cn, us)")

    run_id = uuid.uuid4().hex[:12]
    degradation: list[str] = []

    # 1. Load strategy
    strategies = load_all_strategies(config.strategies_dir)
    if strategy not in strategies:
        available = ", ".join(strategies.keys()) or "(none)"
        raise ValueError(f"Strategy '{strategy}' not found. Available: {available}")

    strat = strategies[strategy]
    screening = strat.screening
    if market not in screening.market_scope:
        raise ValueError(
            f"Strategy '{strategy}' does not support market '{market}'. "
            f"Supported: {', '.join(screening.market_scope)}"
        )
    output_count = max_output or screening.max_output
    analyzer_names = normalize_post_analyzers(
        post_analyzers if post_analyzers is not None else config.post_analyzers
    )
    if deep_analysis and "dsa" not in analyzer_names:
        analyzer_names.append("dsa")
    analyzer_max_picks = (
        post_analysis_max_picks
        or deep_analysis_max_picks
    )
    daily_needed = requires_daily_features(screening.hard_filters)
    daily_requested = config.daily_enrich_enabled if daily_enrich is None else daily_enrich
    daily_limit = daily_enrich_max_candidates or config.daily_enrich_max_candidates
    snapshot_filters = without_daily_filters(screening.hard_filters) if daily_needed else screening.hard_filters

    # 2. Fetch snapshot
    snapshot_df = fetch_snapshot_with_fallback(
        config.snapshot_source_priority,
        required_columns=_required_snapshot_columns(snapshot_filters),
        fallback_snapshot_path=config.fallback_snapshot_path,
        fallback_max_age_hours=config.snapshot_fallback_max_age_hours,
        market=market,
    )
    effective_industry_map_files = (
        list(industry_map_files)
        if industry_map_files is not None
        else list(config.industry_map_files)
    )
    effective_industry_provider = (
        industry_provider
        if industry_provider is not None
        else config.industry_provider
    )
    effective_industry_provider = str(effective_industry_provider or "none").strip().lower()
    if effective_industry_map_files or effective_industry_provider not in {"", "none", "off", "false"}:
        snapshot_df, industry_notes = enrich_industry_concepts(
            snapshot_df,
            map_files=effective_industry_map_files,
            provider=effective_industry_provider,
            max_boards=config.industry_provider_max_boards,
            provider_cache_dir=config.industry_provider_cache_dir,
            provider_cache_ttl_hours=config.industry_provider_cache_ttl_hours,
        )
        degradation.extend(f"Industry/concepts enrichment: {item}" for item in industry_notes)
    snapshot_count = len(snapshot_df)
    snapshot_source = str(snapshot_df.attrs.get("snapshot_source", ""))
    source_errors = [str(item) for item in snapshot_df.attrs.get("source_errors", [])]
    degradation.extend(f"Snapshot source fallback: {item}" for item in source_errors)
    if bool(snapshot_df.attrs.get("fallback_used")):
        stale_age = snapshot_df.attrs.get("stale_age_hours")
        if stale_age is None:
            degradation.append("Snapshot source fallback: last_good_cache stale")
        else:
            degradation.append(
                f"Snapshot source fallback: last_good_cache stale_age_hours={stale_age}"
            )

    # 3. L1 hard filter. If a strategy needs daily features, first apply only
    # snapshot-safe filters, then enrich a narrowed candidate pool.
    if explain_filters:
        snapshot_waterfall = hard_filter_waterfall(snapshot_df, snapshot_filters)
        if snapshot_waterfall:
            degradation.append(
                "Snapshot hard-filter waterfall: "
                + _format_filter_waterfall(snapshot_waterfall)
            )
    df = apply_hard_filters(snapshot_df, snapshot_filters)
    after_filter_count = len(df)

    if df.empty:
        return ScreenResult(
            strategy=strategy,
            market=market,
            snapshot_count=snapshot_count,
            after_filter_count=0,
            run_id=run_id,
            degradation=[*degradation, "No candidates after hard filter"],
            snapshot_source=snapshot_source,
            source_errors=source_errors,
            strategy_version=strat.version,
            strategy_category=strat.category,
            post_analyzers=analyzer_names,
            daily_enriched=False,
            risk_enabled=config.risk_enabled,
            portfolio_diversity_enabled=config.portfolio_diversity_enabled,
        )

    daily_enriched = False
    daily_enrich_count = 0
    if daily_needed or daily_requested:
        provisional = _sort_screened_candidates(compute_screen_scores(df, screening), screening)
        enrich_count = min(daily_limit, len(provisional))
        daily_candidates = provisional.head(enrich_count)
        try:
            enriched = enrich_daily_features(
                daily_candidates,
                max_rows=enrich_count,
                lookback_days=config.daily_lookback_days,
                source=config.daily_source,
                fetch_retries=config.daily_fetch_retries,
                max_workers=config.daily_fetch_max_workers,
            )
            daily_enriched = True
            daily_errors = [str(item) for item in enriched.attrs.get("daily_errors", [])]
            daily_enrich_count = int(enriched.attrs.get("daily_success_count", len(enriched)))
            daily_source_counts = dict(enriched.attrs.get("daily_source_counts", {}) or {})
            daily_quality_flag_counts = dict(enriched.attrs.get("daily_quality_flag_counts", {}) or {})
            daily_source_order_notes = [str(item) for item in enriched.attrs.get("daily_source_order_notes", [])]
            daily_source_health_notes = _daily_source_health_notes(
                dict(enriched.attrs.get("daily_source_health", {}) or {})
            )
            degradation.append(
                f"Daily K-line enrichment attempted {enrich_count} candidates, "
                f"succeeded {daily_enrich_count} of {after_filter_count} snapshot-filtered candidates"
            )
            if daily_source_counts:
                source_summary = ", ".join(
                    f"{name}={count}"
                    for name, count in sorted(daily_source_counts.items())
                )
                degradation.append(f"Daily K-line sources: {source_summary}")
            if daily_quality_flag_counts:
                flag_summary = ", ".join(
                    f"{name}={count}"
                    for name, count in sorted(daily_quality_flag_counts.items())
                )
                degradation.append(f"Daily K-line quality flags: {flag_summary}")
            if daily_source_order_notes:
                degradation.append("Daily K-line source ordering: " + " | ".join(daily_source_order_notes[:3]))
            if daily_source_health_notes:
                degradation.append("Daily K-line source health: " + "; ".join(daily_source_health_notes))
            if daily_errors:
                sample = " | ".join(daily_errors[:5])
                suffix = f" | +{len(daily_errors) - 5} more" if len(daily_errors) > 5 else ""
                degradation.append(f"Daily K-line enrichment row errors: {sample}{suffix}")
            if daily_needed:
                # Re-apply the full filter set now that daily columns exist.
                if explain_filters:
                    daily_waterfall = hard_filter_waterfall(enriched, screening.hard_filters)
                    if daily_waterfall:
                        degradation.append(
                            "Daily hard-filter waterfall: "
                            + _format_filter_waterfall(daily_waterfall)
                        )
                daily_filter_rejections = hard_filter_rejection_summary(
                    enriched,
                    screening.hard_filters,
                    limit=6,
                )
                df = apply_hard_filters(enriched, screening.hard_filters)
                after_filter_count = len(df)
                if daily_filter_rejections:
                    degradation.append(
                        "Daily hard-filter rejections: "
                        + "; ".join(daily_filter_rejections)
                    )
            else:
                df = enriched
        except Exception as exc:
            if daily_needed:
                raise RuntimeError(
                    "Daily K-line enrichment is required by this strategy but failed: "
                    f"{exc}"
                ) from exc
            # Best-effort path: df is still the un-enriched frame here, so the
            # result must not claim enrichment even if the failure happened
            # after enrich_daily_features() itself returned.
            daily_enriched = False
            daily_enrich_count = 0
            # Keep the traceback in the log; the degradation note only carries
            # the exception message.
            logger.warning("Daily K-line enrichment skipped: %s", exc, exc_info=True)
            degradation.append(f"Daily K-line enrichment skipped: {exc}")

        if df.empty:
            return ScreenResult(
                strategy=strategy,
                market=market,
                strategy_version=strat.version,
                strategy_category=strat.category,
                snapshot_count=snapshot_count,
                after_filter_count=0,
                run_id=run_id,
                degradation=[*degradation, "No candidates after daily hard filter"],
                snapshot_source=snapshot_source,
                source_errors=source_errors,
                post_analyzers=analyzer_names,
                daily_enriched=daily_enriched,
                daily_enrich_count=daily_enrich_count,
                risk_enabled=config.risk_enabled,
                portfolio_diversity_enabled=config.portfolio_diversity_enabled,
            )

    # 4. Compute screen_score
    df = _sort_screened_candidates(compute_screen_scores(df, screening), screening)

    # 5. Take Top K for LLM ranking
    top_k = min(
        max(output_count * config.llm_candidate_multiplier, output_count),
        config.llm_max_candidates,
        len(df),
    )
    df_top = df.head(top_k)

    # 6. Build Pick list
    picks = _df_to_picks(df_top)

    # 6.5. Host-provided candidate context, e.g. DSA realtime quote,
    # fundamentals, and news. This runs before LLM ranking so L2 can use it.
    degradation.extend(apply_dsa_provider_context(picks, context))

    # 7. L2 LLM ranking
    llm_ranked = False
    llm_market_view = ""
    llm_selection_logic = ""
    llm_portfolio_risk = ""
    llm_coverage: float | None = None
    llm_parse_errors: list[str] = []
    if use_llm and config.has_llm_config():
        candidate_context_rows: list[dict[str, object]] = []
        event_source_weights = _event_source_weights(screening.event_profile)
        should_collect_candidate_context = (
            config.llm_candidate_context_enabled
            if collect_llm_candidate_context is None
            else collect_llm_candidate_context
        )
        if should_collect_candidate_context:
            candidate_context_rows, candidate_context_errors = collect_candidate_context(
                df_top,
                max_rows=(
                    candidate_context_max_candidates
                    or config.llm_candidate_context_max_candidates
                ),
                providers=(
                    candidate_context_providers
                    if candidate_context_providers is not None
                    else config.llm_candidate_context_providers
                ),
                news_limit=config.llm_candidate_context_news_limit,
                announcement_limit=config.llm_candidate_context_announcement_limit,
                cache_dir=(
                    config.data_dir / "candidate_context"
                    if config.llm_candidate_context_cache_enabled
                    else None
                ),
                cache_ttl_hours=config.llm_candidate_context_cache_ttl_hours,
                source_weights=event_source_weights,
            )
            degradation.append(
                f"Candidate context collected rows={len(candidate_context_rows)}"
            )
            if candidate_context_errors:
                sample = " | ".join(candidate_context_errors[:5])
                suffix = (
                    f" | +{len(candidate_context_errors) - 5} more"
                    if len(candidate_context_errors) > 5
                    else ""
                )
                degradation.append(f"Candidate context row errors: {sample}{suffix}")

        llm_context_degradation: list[str] = []
        effective_context = build_llm_context(
            base_context=llm_context if llm_context is not None else config.llm_context,
            context_files=llm_context_files,
            candidate_context_files=candidate_context_files,
            candidate_context_rows=candidate_context_rows,
            snapshot_df=snapshot_df,
            candidate_df=df_top,
            event_profile=screening.event_profile,
            max_chars=config.llm_context_max_chars,
            degradation=llm_context_degradation,
        )
        degradation.extend(llm_context_degradation)
        llm_prompt_degradation: list[str] = []
        llm_result = rank_candidates_with_metadata(
            picks,
            screening.ranking_hints,
            config.llm_api_key,
            config.llm_model,
            config.llm_base_url,
            context=effective_context,
            rank_weight=config.llm_rank_weight,
            max_retries=config.llm_max_retries,
            min_coverage=config.llm_min_coverage,
            fallback_models=config.llm_fallback_models,
            temperature=config.llm_temperature,
            json_mode=config.llm_json_mode,
            silent=config.llm_silent,
            channels=config.llm_channels,
            config_path=str(config.llm_config_path or ""),
            timeout_sec=config.llm_timeout_sec,
            max_tokens=config.llm_max_tokens,
            degradation=llm_prompt_degradation,
        )
        degradation.extend(llm_prompt_degradation)
        picks = llm_result.picks
        llm_market_view = llm_result.market_view
        llm_selection_logic = llm_result.selection_logic
        llm_portfolio_risk = llm_result.portfolio_risk
        llm_coverage = llm_result.coverage
        llm_parse_errors = llm_result.errors
        llm_ranked = any(p.llm_score is not None for p in picks)
        if not llm_ranked:
            degradation.append("LLM ranking failed: fell back to screen_score")
            for i, p in enumerate(picks):
                p.rank = i + 1
                p.final_score = p.screen_score
    else:
        if use_llm and not config.has_llm_config():
            degradation.append("LLM ranking skipped: no LLM config")
        for i, p in enumerate(picks):
            p.rank = i + 1
            p.final_score = p.screen_score

    # 8. Independent risk overlay
    if config.risk_enabled:
        picks, risk_degradation = apply_risk_overlay(
            picks,
            max_penalty=config.risk_max_penalty,
            veto_high_risk=config.risk_veto_high,
            profile=screening.risk_profile,
        )
        degradation.extend(risk_degradation)

    # 9. LLM-driven portfolio overlay. This runs before trimming so an
    # over-crowded sector can make room for a comparable candidate elsewhere.
    portfolio_concentration_notes: list[str] = []
    if config.portfolio_diversity_enabled:
        picks, portfolio_concentration_notes = apply_portfolio_overlay(
            picks,
            max_same_sector=config.portfolio_max_same_llm_sector,
            concentration_penalty=config.portfolio_concentration_penalty,
            profile=screening.portfolio_profile,
        )

    # 10. Trim to max_output
    picks = picks[:output_count]

    # 11. Optional L3 post-analysis, DSA is only one possible analyzer.
    if analyzer_names:
        picks, post_degradation = run_post_analyzers(
            picks,
            analyzer_names=analyzer_names,
            run_id=run_id,
            config=config,
            max_picks=analyzer_max_picks,
            scorecard_profile=screening.scorecard_profile,
        )
        degradation.extend(post_degradation)

    return ScreenResult(
        strategy=strategy,
        market=market,
        strategy_version=strat.version,
        strategy_category=strat.category,
        snapshot_count=snapshot_count,
        after_filter_count=after_filter_count,
        picks=picks,
        run_id=run_id,
        llm_ranked=llm_ranked,
        llm_market_view=llm_market_view,
        llm_selection_logic=llm_selection_logic,
        llm_portfolio_risk=llm_portfolio_risk,
        llm_coverage=llm_coverage,
        llm_parse_errors=llm_parse_errors,
        degradation=degradation,
        snapshot_source=snapshot_source,
        source_errors=source_errors,
        deep_analysis_requested=("dsa" in analyzer_names),
        post_analyzers=analyzer_names,
        daily_enriched=daily_enriched,
        daily_enrich_count=daily_enrich_count,
        risk_enabled=config.risk_enabled,
        portfolio_diversity_enabled=config.portfolio_diversity_enabled,
        portfolio_concentration_notes=portfolio_concentration_notes,
    )


def _df_to_picks(df: pd.DataFrame) -> list[Pick]:
    """Convert DataFrame rows to Pick objects.

    Ranks are assigned 1..N in row order and ``final_score`` starts out equal
    to ``screen_score``. Several fields fall back to Chinese column names when
    the English column is absent from the row.
    """
    picks = []
    factor_cols = factor_score_columns()
    for i, (_, row) in enumerate(df.iterrows()):
        # Only factors whose score column is present in the frame are reported.
        factor_scores = {
            factor: _safe_float(row.get(col)) or 0.0
            for factor, col in factor_cols.items()
            if col in df.columns
        }
        picks.append(Pick(
            rank=i + 1,
            code=normalize_code(row.get("code", row.get("代码", "")), allow_ticker=True),
            name=str(row.get("name", row.get("名称", row.get("股票名称", "")))),
            screen_score=float(row.get("screen_score", 0)),
            final_score=float(row.get("screen_score", 0)),
            price=float(row.get("price", row.get("最新价", 0)) or 0),
            change_pct=float(row.get("change_pct", row.get("涨跌幅", 0)) or 0),
            amount=float(row.get("amount", row.get("成交额", 0)) or 0),
            total_mv=_safe_float(row.get("total_mv", row.get("总市值"))),
            turnover_rate=_safe_float(row.get("turnover_rate", row.get("换手率"))),
            volume_ratio=_safe_float(row.get("volume_ratio", row.get("量比"))),
            pe_ratio=_safe_float(row.get("pe_ratio", row.get("市盈率"))),
            pb_ratio=_safe_float(row.get("pb_ratio", row.get("市净率"))),
            industry=_safe_text(row.get("industry", row.get("行业", row.get("所属行业", "")))),
            concepts=_safe_text(row.get("concepts", row.get("概念", row.get("概念题材", "")))),
            industry_rank=_safe_int(row.get("industry_rank")),
            industry_change_pct=_safe_float(row.get("industry_change_pct")),
            industry_heat_score=_safe_float(row.get("industry_heat_score")),
            concept_heat_score=_safe_float(row.get("concept_heat_score")),
            board_heat_score=_safe_float(row.get("board_heat_score")),
            board_heat_latest_score=_safe_float(row.get("board_heat_latest_score")),
            board_heat_trend_score=_safe_float(row.get("board_heat_trend_score")),
            board_heat_persistence_score=_safe_float(row.get("board_heat_persistence_score")),
            board_heat_cooling_score=_safe_float(row.get("board_heat_cooling_score")),
            board_heat_observations=_safe_int(row.get("board_heat_observations")),
            board_heat_state=_safe_text(row.get("board_heat_state")),
            board_heat_summary=_safe_text(row.get("board_heat_summary")),
            change_60d=_safe_float(row.get("change_60d")),
            signal_score=_safe_float(row.get("signal_score")),
            ma_bullish=_safe_bool(row.get("ma_bullish")),
            price_above_ma20=_safe_bool(row.get("price_above_ma20")),
            macd_status=str(row.get("macd_status", "") or ""),
            rsi_status=str(row.get("rsi_status", "") or ""),
            breakout_20d_pct=_safe_float(row.get("breakout_20d_pct")),
            range_20d_pct=_safe_float(row.get("range_20d_pct")),
            volume_ratio_20d=_safe_float(row.get("volume_ratio_20d")),
            body_pct=_safe_float(row.get("body_pct")),
            pullback_to_ma20_pct=_safe_float(row.get("pullback_to_ma20_pct")),
            consolidation_days_20d=_safe_int(row.get("consolidation_days_20d")),
            volatility_20d_pct=_safe_float(row.get("volatility_20d_pct")),
            max_drawdown_20d_pct=_safe_float(row.get("max_drawdown_20d_pct")),
            atr_20_pct=_safe_float(row.get("atr_20_pct")),
            daily_quality_score=_safe_float(row.get("daily_quality_score")),
            daily_quality_flags=_safe_text(row.get("daily_quality_flags")),
            daily_source=_safe_text(row.get("daily_source")),
            factor_scores=factor_scores,
        ))
    return picks


def _sort_screened_candidates(df: pd.DataFrame, screening=None) -> pd.DataFrame:
    """Sort scored candidates deterministically with factor-aware tie breakers.

    Rows are ordered by ``screen_score`` descending, then by each
    ``factor_<name>_score`` column descending (factors ordered by descending
    weight, then name, when ``screening.factor_weights`` is set), and finally
    by ``code`` ascending. Columns missing from ``df`` are skipped; if none are
    present the frame is returned unchanged.
    """
    factor_order = ["stability", "activity", "momentum", "value"]
    if screening is not None and screening.factor_weights:
        factor_order = [
            factor
            for factor, _weight in sorted(
                screening.factor_weights.items(),
                key=lambda item: (-float(item[1]), item[0]),
            )
        ]
    sort_columns = [
        column
        for column in ["screen_score"] + [f"factor_{factor}_score" for factor in factor_order]
        if column in df.columns
    ]
    ascending = [False] * len(sort_columns)
    if "code" in df.columns:
        sort_columns.append("code")
        ascending.append(True)
    if not sort_columns:
        return df
    # mergesort is stable, so rows tying on every key keep their input order.
    return df.sort_values(sort_columns, ascending=ascending, kind="mergesort")


def _required_snapshot_columns(filters) -> list[str]:
    """Return the snapshot columns that the enabled hard filters read.

    The result preserves first-seen order and contains no duplicates.
    """
    columns: list[str] = []
    if filters.exclude_st:
        columns.append("name")
    if filters.amount_min is not None:
        columns.append("amount")
    if filters.price_min is not None or filters.price_max is not None:
        columns.append("price")
    if filters.market_cap_min is not None or filters.market_cap_max is not None:
        columns.append("total_mv")
    if filters.pe_ttm_min is not None or filters.pe_ttm_max is not None:
        columns.append("pe_ratio")
    if filters.pb_min is not None or filters.pb_max is not None:
        columns.append("pb_ratio")
    if filters.volume_ratio_min is not None:
        columns.append("volume_ratio")
    if filters.turnover_rate_min is not None:
        columns.append("turnover_rate")
    if filters.change_pct_min is not None or filters.change_pct_max is not None:
        columns.append("change_pct")
    return list(dict.fromkeys(columns))


def _event_source_weights(event_profile: dict[str, object] | None) -> dict[str, float] | None:
    """Extract numeric ``source_weights`` from an event profile.

    Entries whose value cannot be converted to ``float`` are dropped. Returns
    ``None`` when the profile is empty/None, ``source_weights`` is not a dict,
    or no entry survives conversion.
    """
    value = (event_profile or {}).get("source_weights")
    if not isinstance(value, dict):
        return None
    result: dict[str, float] = {}
    for key, raw in value.items():
        try:
            result[str(key)] = float(raw)
        except (TypeError, ValueError):
            continue
    return result or None


def _daily_source_health_notes(health: dict[str, object], *, limit: int = 4) -> list[str]:
    """Summarise unhealthy daily data sources as short notes.

    Only sources that are disabled or have a positive ``failures`` /
    ``total_failures`` count are reported. Disabled sources come first, then
    sources with current failures, then those with only ``total_failures``;
    within a group higher counts come first. At most ``limit`` notes are
    emitted, followed by a ``"+N more"`` entry when sources were left out.
    """
    source_states: list[tuple[tuple[int, float, float, str], str, dict[object, object]]] = []
    for source, raw_state in health.items():
        if not isinstance(raw_state, dict):
            continue
        failures = _safe_float(raw_state.get("failures")) or 0.0
        total_failures = _safe_float(raw_state.get("total_failures")) or 0.0
        disabled = bool(raw_state.get("disabled"))
        if not disabled and failures <= 0 and total_failures <= 0:
            continue
        severity_key = (
            0 if disabled else 1 if failures > 0 else 2,
            -failures,
            -total_failures,
            str(source),
        )
        source_states.append((severity_key, str(source), raw_state))

    notes: list[str] = []
    # Sort on the severity key only: comparing whole tuples would fall through
    # to the dict element on a tie and raise TypeError.
    for _severity_key, source, raw_state in sorted(source_states, key=lambda item: item[0]):
        failures = _safe_float(raw_state.get("failures")) or 0.0
        total_failures = _safe_float(raw_state.get("total_failures")) or 0.0
        disabled = bool(raw_state.get("disabled"))
        last_rows = _safe_float(raw_state.get("last_rows")) or 0.0
        parts: list[str] = []
        if disabled:
            parts.append("disabled")
        if failures > 0:
            parts.append(f"failures={failures:g}")
        elif total_failures > 0:
            parts.append(f"total_failures={total_failures:g}")
        if last_rows > 0:
            parts.append(f"last_rows={last_rows:g}")
        if parts:
            notes.append(f"{source} " + ",".join(parts))
        if len(notes) >= limit:
            break
    hidden_count = len(source_states) - len(notes)
    if hidden_count > 0:
        notes.append(f"+{hidden_count} more")
    return notes


def _format_filter_waterfall(steps: list[dict[str, object]], *, limit: int = 8) -> str:
    """Render hard-filter waterfall steps as a single ``"; "``-joined string.

    Each of the first ``limit`` steps shows the filter name, before/after
    counts, and removed count, plus up to three sample names and the step's
    suggestion when present. Remaining steps collapse into ``"+N more"``.
    """
    parts: list[str] = []
    for step in steps[:limit]:
        text = (
            f"{step.get('filter')} {step.get('before')}->{step.get('after')} "
            f"removed={step.get('removed')}"
        )
        samples = step.get("samples")
        if isinstance(samples, list) and samples:
            sample_names = [
                str(item.get("name") or item.get("code") or item.get("value") or "")
                for item in samples
                if isinstance(item, dict)
            ]
            sample_names = [item for item in sample_names if item]
            if sample_names:
                text += f" samples={','.join(sample_names[:3])}"
        if step.get("suggestion"):
            text += f" next={step.get('suggestion')}"
        parts.append(text)
    hidden = len(steps) - len(parts)
    if hidden > 0:
        parts.append(f"+{hidden} more")
    return "; ".join(parts)


def _safe_text(v: object) -> str:
    """Normalise ``v`` to text via ``safe_text`` with a 120-character cap."""
    return safe_text(v, max_len=120)