fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
504 lines
21 KiB
Python
504 lines
21 KiB
Python
# -*- coding: utf-8 -*-
|
||
"""screen_score calculation."""
|
||
|
||
import pandas as pd
|
||
|
||
from alphasift.models import ScreeningConfig
|
||
|
||
_FACTOR_COLUMNS = {
|
||
"value": "factor_value_score",
|
||
"liquidity": "factor_liquidity_score",
|
||
"momentum": "factor_momentum_score",
|
||
"reversal": "factor_reversal_score",
|
||
"activity": "factor_activity_score",
|
||
"stability": "factor_stability_score",
|
||
"size": "factor_size_score",
|
||
"theme_heat": "factor_theme_heat_score",
|
||
"topic_alignment": "factor_topic_alignment_score",
|
||
}
|
||
_DEFAULT_SCORING_PROFILE = {
|
||
"momentum_base": 60.0,
|
||
"momentum_intraday_slope": 5.0,
|
||
"momentum_chase_start_pct": 5.0,
|
||
"momentum_chase_penalty_slope": 10.0,
|
||
"momentum_downside_start_pct": -2.0,
|
||
"momentum_downside_penalty_slope": 3.0,
|
||
"momentum_60d_base": 55.0,
|
||
"momentum_60d_slope": 0.9,
|
||
"momentum_60d_overheat_pct": 45.0,
|
||
"momentum_60d_overheat_penalty_slope": 0.8,
|
||
"momentum_60d_breakdown_pct": -20.0,
|
||
"momentum_60d_breakdown_penalty_slope": 0.7,
|
||
"macd_bullish_bonus": 6.0,
|
||
"macd_bearish_penalty": 8.0,
|
||
"reversal_ideal_change_pct": -3.0,
|
||
"reversal_distance_penalty_slope": 13.0,
|
||
"reversal_collapse_start_pct": -8.0,
|
||
"reversal_collapse_penalty_slope": 10.0,
|
||
"reversal_chase_start_pct": 1.0,
|
||
"reversal_chase_penalty_slope": 8.0,
|
||
"rsi_oversold_bonus": 10.0,
|
||
"rsi_overbought_penalty": 14.0,
|
||
"activity_ideal_volume_ratio": 2.0,
|
||
"activity_volume_ratio_distance_slope": 15.0,
|
||
"activity_high_volume_ratio": 5.0,
|
||
"activity_high_volume_ratio_penalty_slope": 8.0,
|
||
"activity_ideal_turnover_rate": 4.0,
|
||
"activity_turnover_distance_slope": 8.0,
|
||
"activity_high_turnover_rate": 12.0,
|
||
"activity_high_turnover_penalty_slope": 5.0,
|
||
"stability_base": 78.0,
|
||
"stability_change_abs_penalty_slope": 3.0,
|
||
"stability_hot_change_pct": 7.0,
|
||
"stability_hot_change_penalty_slope": 5.0,
|
||
"stability_high_turnover_rate": 10.0,
|
||
"stability_high_turnover_penalty_slope": 2.0,
|
||
"stability_high_volume_ratio": 5.0,
|
||
"stability_high_volume_ratio_penalty_slope": 4.0,
|
||
"stability_invalid_pe_penalty": 18.0,
|
||
"stability_high_volatility_pct": 45.0,
|
||
"stability_high_volatility_penalty_slope": 0.45,
|
||
"stability_max_drawdown_floor_pct": -12.0,
|
||
"stability_drawdown_penalty_slope": 1.2,
|
||
"stability_high_atr_pct": 6.0,
|
||
"stability_high_atr_penalty_slope": 2.0,
|
||
"stability_low_daily_quality_score": 80.0,
|
||
"stability_low_daily_quality_penalty_slope": 0.35,
|
||
"stability_bad_daily_quality_flag_penalty": 8.0,
|
||
"theme_heat_unknown_score": 50.0,
|
||
"theme_heat_change_slope": 6.0,
|
||
"theme_heat_rank_bonus": 10.0,
|
||
"theme_heat_trend_min_observations": 2.0,
|
||
"theme_heat_trend_slope": 0.8,
|
||
"theme_heat_trend_bonus_cap": 10.0,
|
||
"theme_heat_cooling_penalty_slope": 0.8,
|
||
"theme_heat_cooling_penalty_cap": 12.0,
|
||
"theme_heat_persistence_min_score": 60.0,
|
||
"theme_heat_persistence_slope": 0.08,
|
||
"theme_heat_persistence_bonus_cap": 6.0,
|
||
"theme_heat_cooling_score_penalty_slope": 0.6,
|
||
"theme_heat_cooling_score_penalty_cap": 10.0,
|
||
"theme_heat_overheat_score": 88.0,
|
||
"theme_heat_overheat_penalty_slope": 0.5,
|
||
"topic_alignment_unknown_score": 50.0,
|
||
"topic_alignment_match_bonus": 25.0,
|
||
"topic_alignment_heat_weight": 0.25,
|
||
"topic_alignment_unmatched_penalty": 12.0,
|
||
}
|
||
|
||
|
||
def compute_screen_scores(df: pd.DataFrame, config: ScreeningConfig) -> pd.DataFrame:
|
||
"""Compute screen_score for each candidate row.
|
||
|
||
Adds a 'screen_score' column (0-100). Higher is better.
|
||
"""
|
||
result = df.copy()
|
||
factors = _compute_factor_scores(result, config)
|
||
for name, series in factors.items():
|
||
result[_FACTOR_COLUMNS[name]] = series.round(4)
|
||
|
||
weights = _normalized_factor_weights(config)
|
||
result["screen_score"] = 0.0
|
||
for factor, weight in weights.items():
|
||
if factor in factors:
|
||
result["screen_score"] += factors[factor] * weight
|
||
|
||
result["screen_score"] = result["screen_score"].clip(0, 100)
|
||
|
||
return result
|
||
|
||
|
||
def factor_score_columns() -> dict[str, str]:
|
||
"""Return the stable factor-score column mapping used in Pick output."""
|
||
return dict(_FACTOR_COLUMNS)
|
||
|
||
|
||
def _normalized_factor_weights(config: ScreeningConfig) -> dict[str, float]:
|
||
"""Use explicit factor weights, or derive a sane legacy default from tech_weight."""
|
||
raw_weights = config.factor_weights or {
|
||
"value": (1 - config.tech_weight) * 0.50,
|
||
"liquidity": (1 - config.tech_weight) * 0.25,
|
||
"stability": (1 - config.tech_weight) * 0.25,
|
||
"momentum": config.tech_weight * 0.55,
|
||
"activity": config.tech_weight * 0.45,
|
||
}
|
||
weights = {
|
||
factor: max(float(weight), 0.0)
|
||
for factor, weight in raw_weights.items()
|
||
if factor in _FACTOR_COLUMNS
|
||
}
|
||
total = sum(weights.values())
|
||
if total <= 0:
|
||
return {"value": 0.4, "liquidity": 0.2, "momentum": 0.2, "activity": 0.2}
|
||
return {factor: weight / total for factor, weight in weights.items()}
|
||
|
||
|
||
def _compute_factor_scores(df: pd.DataFrame, config: ScreeningConfig | None = None) -> dict[str, pd.Series]:
|
||
config = config or ScreeningConfig()
|
||
profile = _scoring_profile(config)
|
||
return {
|
||
"value": _compute_value_score(df),
|
||
"liquidity": _compute_liquidity_score(df),
|
||
"momentum": _compute_momentum_score(df, profile),
|
||
"reversal": _compute_reversal_score(df, profile),
|
||
"activity": _compute_activity_score(df, profile),
|
||
"stability": _compute_stability_score(df, profile),
|
||
"size": _compute_size_score(df),
|
||
"theme_heat": _compute_theme_heat_score(df, profile),
|
||
"topic_alignment": _compute_topic_alignment_score(df, profile),
|
||
}
|
||
|
||
|
||
def _scoring_profile(config: ScreeningConfig) -> dict[str, float]:
|
||
profile = dict(_DEFAULT_SCORING_PROFILE)
|
||
for key, value in (config.scoring_profile or {}).items():
|
||
if key in profile:
|
||
profile[key] = float(value)
|
||
return profile
|
||
|
||
|
||
def _compute_snapshot_score(df: pd.DataFrame) -> pd.Series:
|
||
"""Score based on snapshot fundamentals (0-100).
|
||
|
||
Components:
|
||
- PE ratio: lower is better (for value), normalized
|
||
- PB ratio: lower is better, normalized
|
||
- Turnover rate: moderate is best
|
||
- Amount (liquidity): higher is better, log-scaled
|
||
- Change pct: near zero or moderate positive preferred
|
||
"""
|
||
factors = _compute_factor_scores(df)
|
||
return (
|
||
factors["value"] * 0.50
|
||
+ factors["liquidity"] * 0.25
|
||
+ factors["stability"] * 0.25
|
||
).clip(0, 100)
|
||
|
||
|
||
def _compute_tech_score(df: pd.DataFrame) -> pd.Series:
|
||
"""Score based on technical features (0-100).
|
||
|
||
Uses available columns like volume_ratio, change_pct patterns.
|
||
Full tech scoring (MA structure, MACD/RSI) needs daily data,
|
||
which is not in the snapshot — scored conservatively here.
|
||
"""
|
||
factors = _compute_factor_scores(df)
|
||
return (factors["momentum"] * 0.55 + factors["activity"] * 0.45).clip(0, 100)
|
||
|
||
|
||
def _compute_value_score(df: pd.DataFrame) -> pd.Series:
|
||
score = pd.Series(50.0, index=df.index)
|
||
|
||
if "pe_ratio" in df.columns:
|
||
pe = pd.to_numeric(df["pe_ratio"], errors="coerce")
|
||
pe_score = _rank_score(pe.where((pe > 0) & (pe < 500)), lower_is_better=True, na_score=25)
|
||
score = score * 0.35 + pe_score * 0.65
|
||
|
||
if "pb_ratio" in df.columns:
|
||
pb = pd.to_numeric(df["pb_ratio"], errors="coerce")
|
||
pb_score = _rank_score(pb.where((pb > 0) & (pb < 50)), lower_is_better=True, na_score=25)
|
||
score = score * 0.55 + pb_score * 0.45
|
||
|
||
return score.clip(0, 100)
|
||
|
||
|
||
def _compute_liquidity_score(df: pd.DataFrame) -> pd.Series:
|
||
if "amount" not in df.columns:
|
||
return pd.Series(50.0, index=df.index)
|
||
|
||
import numpy as np
|
||
|
||
amount = pd.to_numeric(df["amount"], errors="coerce")
|
||
log_amount = np.log10(amount.clip(lower=1))
|
||
return _rank_score(log_amount.where(amount > 0), lower_is_better=False, na_score=20)
|
||
|
||
|
||
def _compute_momentum_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
score = pd.Series(50.0, index=df.index)
|
||
|
||
if "change_pct" in df.columns:
|
||
change = pd.to_numeric(df["change_pct"], errors="coerce").fillna(0)
|
||
# Prefer constructive positive moves, but penalize chase-risk near limit-up.
|
||
intraday_score = profile["momentum_base"] + change * profile["momentum_intraday_slope"]
|
||
intraday_score = intraday_score - (
|
||
change - profile["momentum_chase_start_pct"]
|
||
).clip(lower=0) * profile["momentum_chase_penalty_slope"]
|
||
intraday_score = intraday_score - (
|
||
-change + profile["momentum_downside_start_pct"]
|
||
).clip(lower=0) * profile["momentum_downside_penalty_slope"]
|
||
score = score * 0.35 + intraday_score.clip(5, 100) * 0.65
|
||
|
||
if "change_60d" in df.columns:
|
||
change_60d = pd.to_numeric(df["change_60d"], errors="coerce").fillna(0)
|
||
trend_score = profile["momentum_60d_base"] + change_60d * profile["momentum_60d_slope"]
|
||
trend_score = trend_score - (
|
||
change_60d - profile["momentum_60d_overheat_pct"]
|
||
).clip(lower=0) * profile["momentum_60d_overheat_penalty_slope"]
|
||
trend_score = trend_score - (
|
||
-change_60d + profile["momentum_60d_breakdown_pct"]
|
||
).clip(lower=0) * profile["momentum_60d_breakdown_penalty_slope"]
|
||
score = score * 0.60 + trend_score.clip(5, 100) * 0.40
|
||
|
||
if "signal_score" in df.columns:
|
||
signal = pd.to_numeric(df["signal_score"], errors="coerce").fillna(50)
|
||
score = score * 0.70 + signal.clip(0, 100) * 0.30
|
||
|
||
if "macd_status" in df.columns:
|
||
macd = df["macd_status"].astype(str)
|
||
score = score + macd.map({
|
||
"bullish": profile["macd_bullish_bonus"],
|
||
"bearish": -profile["macd_bearish_penalty"],
|
||
}).fillna(0)
|
||
|
||
return score.clip(5, 100)
|
||
|
||
|
||
def _compute_reversal_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
if "change_pct" not in df.columns:
|
||
return pd.Series(50.0, index=df.index)
|
||
|
||
change = pd.to_numeric(df["change_pct"], errors="coerce").fillna(0)
|
||
# Reversal setups prefer controlled weakness, not collapse.
|
||
score = 100 - (
|
||
change - profile["reversal_ideal_change_pct"]
|
||
).abs() * profile["reversal_distance_penalty_slope"]
|
||
score = score - (
|
||
-change + profile["reversal_collapse_start_pct"]
|
||
).clip(lower=0) * profile["reversal_collapse_penalty_slope"]
|
||
score = score - (
|
||
change - profile["reversal_chase_start_pct"]
|
||
).clip(lower=0) * profile["reversal_chase_penalty_slope"]
|
||
|
||
if "rsi_status" in df.columns:
|
||
rsi = df["rsi_status"].astype(str)
|
||
score = score + rsi.map({
|
||
"oversold": profile["rsi_oversold_bonus"],
|
||
"overbought": -profile["rsi_overbought_penalty"],
|
||
}).fillna(0)
|
||
if "change_60d" in df.columns:
|
||
change_60d = pd.to_numeric(df["change_60d"], errors="coerce").fillna(0)
|
||
score = score - (change_60d - 35).clip(lower=0) * 0.5
|
||
score = score - (-change_60d - 35).clip(lower=0) * 0.8
|
||
return score.clip(5, 100)
|
||
|
||
|
||
def _compute_activity_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
score = pd.Series(50.0, index=df.index)
|
||
|
||
if "volume_ratio" in df.columns:
|
||
volume_ratio = pd.to_numeric(df["volume_ratio"], errors="coerce").fillna(1.0)
|
||
vr_score = 100 - (
|
||
volume_ratio - profile["activity_ideal_volume_ratio"]
|
||
).abs() * profile["activity_volume_ratio_distance_slope"]
|
||
vr_score = vr_score - (
|
||
volume_ratio - profile["activity_high_volume_ratio"]
|
||
).clip(lower=0) * profile["activity_high_volume_ratio_penalty_slope"]
|
||
score = score * 0.45 + vr_score.clip(5, 100) * 0.55
|
||
|
||
if "turnover_rate" in df.columns:
|
||
turnover = pd.to_numeric(df["turnover_rate"], errors="coerce").fillna(0)
|
||
turnover_score = 100 - (
|
||
turnover - profile["activity_ideal_turnover_rate"]
|
||
).abs() * profile["activity_turnover_distance_slope"]
|
||
turnover_score = turnover_score - (
|
||
turnover - profile["activity_high_turnover_rate"]
|
||
).clip(lower=0) * profile["activity_high_turnover_penalty_slope"]
|
||
turnover_score = turnover_score.where(turnover > 0, 40)
|
||
score = score * 0.55 + turnover_score.clip(5, 100) * 0.45
|
||
|
||
return score.clip(0, 100)
|
||
|
||
|
||
def _compute_stability_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
score = pd.Series(profile["stability_base"], index=df.index)
|
||
|
||
if "change_pct" in df.columns:
|
||
change = pd.to_numeric(df["change_pct"], errors="coerce").fillna(0)
|
||
score -= change.abs().clip(upper=10) * profile["stability_change_abs_penalty_slope"]
|
||
score -= (
|
||
change - profile["stability_hot_change_pct"]
|
||
).clip(lower=0) * profile["stability_hot_change_penalty_slope"]
|
||
|
||
if "turnover_rate" in df.columns:
|
||
turnover = pd.to_numeric(df["turnover_rate"], errors="coerce").fillna(0)
|
||
score -= (
|
||
turnover - profile["stability_high_turnover_rate"]
|
||
).clip(lower=0) * profile["stability_high_turnover_penalty_slope"]
|
||
|
||
if "volume_ratio" in df.columns:
|
||
volume_ratio = pd.to_numeric(df["volume_ratio"], errors="coerce").fillna(1)
|
||
score -= (
|
||
volume_ratio - profile["stability_high_volume_ratio"]
|
||
).clip(lower=0) * profile["stability_high_volume_ratio_penalty_slope"]
|
||
|
||
if "pe_ratio" in df.columns:
|
||
pe = pd.to_numeric(df["pe_ratio"], errors="coerce")
|
||
score = score.where((pe.isna()) | (pe > 0), score - profile["stability_invalid_pe_penalty"])
|
||
|
||
if "signal_score" in df.columns:
|
||
signal = pd.to_numeric(df["signal_score"], errors="coerce").fillna(50)
|
||
score = score + (signal - 50) * 0.12
|
||
|
||
if "volatility_20d_pct" in df.columns:
|
||
volatility = pd.to_numeric(df["volatility_20d_pct"], errors="coerce")
|
||
score -= (
|
||
volatility - profile["stability_high_volatility_pct"]
|
||
).clip(lower=0).fillna(0) * profile["stability_high_volatility_penalty_slope"]
|
||
|
||
if "max_drawdown_20d_pct" in df.columns:
|
||
drawdown = pd.to_numeric(df["max_drawdown_20d_pct"], errors="coerce")
|
||
score -= (
|
||
profile["stability_max_drawdown_floor_pct"] - drawdown
|
||
).clip(lower=0).fillna(0) * profile["stability_drawdown_penalty_slope"]
|
||
|
||
if "atr_20_pct" in df.columns:
|
||
atr = pd.to_numeric(df["atr_20_pct"], errors="coerce")
|
||
score -= (
|
||
atr - profile["stability_high_atr_pct"]
|
||
).clip(lower=0).fillna(0) * profile["stability_high_atr_penalty_slope"]
|
||
|
||
if "daily_quality_score" in df.columns:
|
||
quality = pd.to_numeric(df["daily_quality_score"], errors="coerce")
|
||
score -= (
|
||
profile["stability_low_daily_quality_score"] - quality
|
||
).clip(lower=0).fillna(0) * profile["stability_low_daily_quality_penalty_slope"]
|
||
|
||
if "daily_quality_flags" in df.columns:
|
||
flags = df["daily_quality_flags"].fillna("").astype(str)
|
||
severe_flags = flags.str.contains("invalid_ohlc|non_positive_price|negative_volume|stale_cache")
|
||
score -= severe_flags.astype(float) * profile["stability_bad_daily_quality_flag_penalty"]
|
||
|
||
return score.clip(0, 100)
|
||
|
||
|
||
def _compute_size_score(df: pd.DataFrame) -> pd.Series:
|
||
if "total_mv" not in df.columns:
|
||
return pd.Series(50.0, index=df.index)
|
||
|
||
import numpy as np
|
||
|
||
mv = pd.to_numeric(df["total_mv"], errors="coerce")
|
||
log_mv = np.log10(mv.clip(lower=1))
|
||
return _rank_score(log_mv.where(mv > 0), lower_is_better=False, na_score=35)
|
||
|
||
|
||
def _compute_theme_heat_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
base = pd.Series(profile["theme_heat_unknown_score"], index=df.index)
|
||
if "board_heat_score" in df.columns:
|
||
score = pd.to_numeric(df["board_heat_score"], errors="coerce").fillna(base)
|
||
elif "industry_heat_score" in df.columns or "concept_heat_score" in df.columns:
|
||
industry = _numeric_column(df, "industry_heat_score")
|
||
concept = _numeric_column(df, "concept_heat_score")
|
||
score = pd.concat([industry, concept], axis=1).max(axis=1).fillna(base)
|
||
elif "industry_change_pct" in df.columns:
|
||
change = pd.to_numeric(df["industry_change_pct"], errors="coerce").fillna(0)
|
||
score = base + change * profile["theme_heat_change_slope"]
|
||
if "industry_rank" in df.columns:
|
||
rank = pd.to_numeric(df["industry_rank"], errors="coerce")
|
||
score += (
|
||
(profile["theme_heat_rank_bonus"] - rank.clip(lower=1, upper=10))
|
||
.clip(lower=0)
|
||
.fillna(0)
|
||
)
|
||
else:
|
||
return base.clip(0, 100)
|
||
|
||
if "board_heat_trend_score" in df.columns:
|
||
trend = pd.to_numeric(df["board_heat_trend_score"], errors="coerce").fillna(0)
|
||
if "board_heat_observations" in df.columns:
|
||
observations = pd.to_numeric(df["board_heat_observations"], errors="coerce").fillna(0)
|
||
else:
|
||
observations = pd.Series(profile["theme_heat_trend_min_observations"], index=df.index)
|
||
trend_is_reliable = observations >= profile["theme_heat_trend_min_observations"]
|
||
trend_bonus = (trend.clip(lower=0) * profile["theme_heat_trend_slope"]).clip(
|
||
upper=profile["theme_heat_trend_bonus_cap"]
|
||
)
|
||
cooling_penalty = ((-trend).clip(lower=0) * profile["theme_heat_cooling_penalty_slope"]).clip(
|
||
upper=profile["theme_heat_cooling_penalty_cap"]
|
||
)
|
||
score = score + (trend_bonus - cooling_penalty).where(trend_is_reliable, 0)
|
||
|
||
if "board_heat_persistence_score" in df.columns:
|
||
persistence = pd.to_numeric(df["board_heat_persistence_score"], errors="coerce").fillna(0)
|
||
persistence_bonus = (
|
||
(persistence - profile["theme_heat_persistence_min_score"]).clip(lower=0)
|
||
* profile["theme_heat_persistence_slope"]
|
||
).clip(upper=profile["theme_heat_persistence_bonus_cap"])
|
||
score = score + persistence_bonus
|
||
|
||
if "board_heat_cooling_score" in df.columns:
|
||
cooling = pd.to_numeric(df["board_heat_cooling_score"], errors="coerce").fillna(0)
|
||
cooling_penalty = (cooling * profile["theme_heat_cooling_score_penalty_slope"]).clip(
|
||
upper=profile["theme_heat_cooling_score_penalty_cap"]
|
||
)
|
||
score = score - cooling_penalty
|
||
|
||
overheat = (score - profile["theme_heat_overheat_score"]).clip(lower=0)
|
||
score = score - overheat * profile["theme_heat_overheat_penalty_slope"]
|
||
return score.clip(0, 100)
|
||
|
||
|
||
def _compute_topic_alignment_score(df: pd.DataFrame, profile: dict[str, float]) -> pd.Series:
|
||
"""Score whether industry/concept labels align with hotspot route summaries."""
|
||
base = pd.Series(profile["topic_alignment_unknown_score"], index=df.index)
|
||
if not {"industry", "concepts", "board_heat_summary"} & set(df.columns):
|
||
return base
|
||
|
||
scores = []
|
||
for _, row in df.iterrows():
|
||
candidate_topics = _topic_tokens(row.get("industry")) | _topic_tokens(row.get("concepts"))
|
||
route_topics = _topic_tokens(row.get("board_heat_summary"))
|
||
if not candidate_topics or not route_topics:
|
||
scores.append(float(profile["topic_alignment_unknown_score"]))
|
||
continue
|
||
overlap = candidate_topics & route_topics
|
||
score = float(profile["topic_alignment_unknown_score"])
|
||
if overlap:
|
||
score += float(profile["topic_alignment_match_bonus"])
|
||
heat = pd.to_numeric(row.get("board_heat_score"), errors="coerce")
|
||
if pd.notna(heat):
|
||
score += max(float(heat) - 50.0, 0.0) * float(profile["topic_alignment_heat_weight"])
|
||
else:
|
||
score -= float(profile["topic_alignment_unmatched_penalty"])
|
||
scores.append(score)
|
||
return pd.Series(scores, index=df.index).clip(0, 100)
|
||
|
||
|
||
def _topic_tokens(value: object) -> set[str]:
|
||
text = str(value or "").strip()
|
||
if not text or text.lower() in {"nan", "none", "<na>"}:
|
||
return set()
|
||
normalized = text
|
||
for sep in ["|", ",", ",", ";", ";", "/"]:
|
||
normalized = normalized.replace(sep, " ")
|
||
tokens = set()
|
||
for raw in normalized.split():
|
||
token = raw.split(":", 1)[0].strip()
|
||
if token and token.lower() not in {"rank", "nan", "none", "<na>"}:
|
||
tokens.add(token)
|
||
return tokens
|
||
|
||
|
||
def _numeric_column(df: pd.DataFrame, column: str) -> pd.Series:
|
||
if column not in df.columns:
|
||
return pd.Series(pd.NA, index=df.index)
|
||
return pd.to_numeric(df[column], errors="coerce")
|
||
|
||
|
||
def _rank_score(
|
||
series: pd.Series,
|
||
*,
|
||
lower_is_better: bool,
|
||
na_score: float = 50.0,
|
||
) -> pd.Series:
|
||
numeric = pd.to_numeric(series, errors="coerce")
|
||
if numeric.notna().sum() == 0:
|
||
return pd.Series(float(na_score), index=series.index)
|
||
|
||
ranks = numeric.rank(
|
||
ascending=not lower_is_better,
|
||
na_option="keep",
|
||
pct=True,
|
||
) * 100
|
||
return ranks.fillna(float(na_score)).clip(0, 100)
|