fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
463 lines
14 KiB
Python
463 lines
14 KiB
Python
# -*- coding:utf-8 -*-
|
|
# !/usr/bin/env python
|
|
"""
|
|
Date: 2026/4/12 17:00
|
|
Desc: 百度股市通-经济数据
|
|
https://finance.baidu.com/calendar
|
|
"""
|
|
|
|
import math
|
|
import re
|
|
|
|
import pandas as pd
|
|
from curl_cffi import requests
|
|
|
|
|
|
def _get_baidu_cookie(headers: dict) -> str:
    """
    Collect the cookies that the Baidu finance calendar requests send along.

    Two requests are issued on a single session: the calendar landing page,
    which must set ``BAIDUID`` / ``BAIDUID_BFESS``, and the ``hm.js`` script
    whose URL is found in that page, which must set ``HMACCOUNT`` /
    ``HMACCOUNT_BFESS``.

    :param headers: base request headers installed on the session
    :return: the four cookies formatted as one ``name=value; ...`` string
    :raises ValueError: a required cookie or the hm.js URL is missing
    :raises ConnectionError: a request raised ``requests.exceptions.RequestException``
    """
    try:
        # Both requests share one session so cookie state carries over.
        with requests.Session() as session:
            session.headers.update(headers)

            # Step 1: landing page, expected to hand out the BAIDUID cookies.
            landing = session.get(
                "https://finance.baidu.com/calendar",
                impersonate="chrome110",
                timeout=10,
            )
            landing.raise_for_status()

            baiduid = landing.cookies.get("BAIDUID")
            baiduid_bfess = landing.cookies.get("BAIDUID_BFESS")
            if not (baiduid and baiduid_bfess):
                raise ValueError("Missing BAIDUID cookies in first response")

            # Step 2: find the hm.js URL; the absolute form is tried first,
            # the protocol-relative form is the fallback.
            found = None
            for pattern in (
                r"https://hm\.baidu\.com/hm\.js\?\w+",
                r"//hm\.baidu\.com/hm\.js\?\w+",
            ):
                found = re.search(pattern, landing.text)
                if found:
                    break
            if not found:
                raise ValueError("Failed to extract hm.js URL from response")

            hm_url = found.group()
            if hm_url.startswith("//"):
                hm_url = "https:" + hm_url

            tracker = session.get(hm_url, impersonate="chrome110", timeout=10)
            tracker.raise_for_status()

            hmac_count = tracker.cookies.get("HMACCOUNT")
            hmac_count_bfess = tracker.cookies.get("HMACCOUNT_BFESS")
            if not (hmac_count and hmac_count_bfess):
                raise ValueError("Missing HMACCOUNT cookies in second response")

            cookie_header = (
                f"BAIDUID={baiduid}; "
                f"BAIDUID_BFESS={baiduid_bfess}; "
                f"HMACCOUNT={hmac_count}; "
                f"HMACCOUNT_BFESS={hmac_count_bfess}"
            )
            return cookie_header

    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Network request failed: {str(e)}") from e
    except re.error as e:
        raise ValueError(f"Regex pattern error: {str(e)}") from e
|
|
|
|
|
|
def _baidu_finance_calendar(
    date: str, cate: str, process_func, cookie: str = None
) -> pd.DataFrame:
    """
    Fetch one day of Baidu finance calendar data, following pagination.

    The first page is requested to learn how many records exist for the
    requested date; remaining pages are then fetched and every matching
    calendar entry is passed through ``process_func``.

    :param date: query date in ``YYYYMMDD`` form
    :param cate: value of the ``cate`` query parameter
        (e.g. "economic_data", "notify_suspend")
    :param process_func: callable that converts the ``list`` payload of one
        calendar entry into a DataFrame
    :param cookie: ready-made Cookie header value; when ``None`` it is
        obtained through ``_get_baidu_cookie``
    :return: the processed frames concatenated with a fresh index, or an
        empty DataFrame when no entry matched the date
    :raises RuntimeError: obtaining the cookie failed
    """
    # Single source of truth for the page size: it is both sent as ``rn``
    # and used to derive the page count.
    page_size = 100

    # YYYYMMDD -> YYYY-MM-DD, the form used in the query and in the response.
    formatted_date = "-".join([date[:4], date[4:6], date[6:]])

    base_params = {
        "start_date": formatted_date,
        "end_date": formatted_date,
        "pn": "0",
        "rn": str(page_size),
        "cate": cate,
        "finClientType": "pc",
    }

    headers = {
        "accept": "application/vnd.finance-web.v1+json",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
        "cache-control": "no-cache",
        "origin": "https://finance.baidu.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://finance.baidu.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/142.0.0.0 Safari/537.36",
    }

    if cookie is None:
        try:
            # Pass a copy so the bootstrap cannot mutate our headers.
            cookie = _get_baidu_cookie(headers.copy())
        except Exception as e:
            raise RuntimeError(f"Failed to obtain Baidu cookies: {str(e)}") from e
    headers["cookie"] = cookie

    url = "https://finance.pae.baidu.com/sapi/v1/financecalendar"

    def _fetch_page(page: int) -> dict:
        """Request one page (``pn`` = page index) and return the decoded JSON."""
        params = {**base_params, "pn": str(page)}
        # Every page uses the same impersonation as the first request and
        # the cookie bootstrap; previously only page 0 did.
        response = requests.get(
            url=url, params=params, headers=headers, impersonate="chrome110"
        )
        response.raise_for_status()
        return response.json()

    def _entries_for_date(payload: dict) -> list:
        """Return the calendarInfo entries whose date equals the query date."""
        if "Result" in payload and "calendarInfo" in payload["Result"]:
            return [
                entry
                for entry in payload["Result"]["calendarInfo"]
                if entry.get("date") == formatted_date
            ]
        return []

    first_page = _fetch_page(0)

    # The first matching entry reports the total record count for the date.
    total_records = 0
    for entry in _entries_for_date(first_page):
        # NOTE(review): assumes ``total`` is a number or numeric string —
        # confirm against the API; a missing/None value is treated as 0.
        total_records = int(entry.get("total", 0) or 0)
        break

    total_pages = max(1, math.ceil(total_records / page_size))

    # Collect frames and concatenate once instead of growing a frame per page.
    frames = []
    for page in range(total_pages):
        payload = first_page if page == 0 else _fetch_page(page)
        for entry in _entries_for_date(payload):
            if entry.get("list"):
                frames.append(process_func(entry["list"]))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
|
|
|
|
|
|
def _process_economic_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw economic-calendar records into a tidy DataFrame.

    API field names are renamed to the Chinese column headers, the four
    value columns are guaranteed to exist, the known columns are put into a
    fixed order (unknown ones are dropped), value columns are coerced to
    numbers and the date column to ``datetime.date``.

    :param data_list: list of record dicts from one calendar entry
    :return: processed DataFrame; an empty DataFrame for empty input
    """
    if not data_list:
        return pd.DataFrame()

    rename_dict = {
        "date": "日期",
        "time": "时间",
        "title": "事件",
        "star": "重要性",
        "formerVal": "前值",
        "pubVal": "公布",
        "region": "地区",
        "indicateVal": "预期",
        "country": "国家",
        "timePeriod": "统计周期",
    }
    numeric_cols = ["公布", "预期", "前值", "重要性"]
    ordered_cols = [
        "日期",
        "时间",
        "国家",
        "地区",
        "事件",
        "统计周期",
        "公布",
        "预期",
        "前值",
        "重要性",
    ]

    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)

    # The value columns are always part of the output, even when absent.
    for col in numeric_cols:
        if col not in temp_df.columns:
            temp_df[col] = None

    # Explicit copy: the conversions below assign into this frame, and
    # assigning into a bare column subset triggers SettingWithCopyWarning.
    keep = [col for col in ordered_cols if col in temp_df.columns]
    temp_df = temp_df[keep].copy()

    for col in numeric_cols:
        temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce")
    if "日期" in temp_df.columns:
        temp_df["日期"] = pd.to_datetime(temp_df["日期"], errors="coerce").dt.date

    return temp_df
|
|
|
|
|
|
def _process_suspend_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw trading-suspension records into a DataFrame with a fixed schema.

    API field names are renamed to the Chinese column headers and the result
    always carries the same twelve columns in the same order. A missing
    resumption time (``end``) is filled with "-"; any other field the API
    omits becomes a NaN column instead of raising ``KeyError``.

    :param data_list: list of record dicts from one calendar entry
    :return: processed DataFrame; an empty DataFrame for empty input
    """
    if not data_list:
        return pd.DataFrame()

    rename_dict = {
        "code": "股票代码",
        "name": "股票简称",
        "exchange": "交易所代码",
        "start": "停牌时间",
        "reason": "停牌事项说明",
        "marketValue": "市值",
        "date": "公告日期",
        "time": "公告时间",
        "type": "证券类型",
        "market": "市场类型",
        "isSkip": "是否跳过",
        "end": "复牌时间",
    }
    ordered_cols = [
        "股票代码",
        "股票简称",
        "交易所代码",
        "停牌时间",
        "复牌时间",
        "停牌事项说明",
        "市值",
        "公告日期",
        "公告时间",
        "证券类型",
        "市场类型",
        "是否跳过",
    ]

    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)
    if "复牌时间" not in temp_df.columns:
        temp_df["复牌时间"] = "-"

    # reindex keeps present columns as they are and creates absent ones as
    # NaN, so one omitted API field no longer aborts the whole fetch.
    return temp_df.reindex(columns=ordered_cols)
|
|
|
|
|
|
def news_economic_baidu(date: str = "20251126", cookie: str = None) -> pd.DataFrame:
    """
    Baidu finance calendar - economic data.

    https://finance.baidu.com/calendar

    :param date: query date in ``YYYYMMDD`` form
    :param cookie: Cookie header value forwarded to the calendar request
    :return: economic data as a pd.DataFrame
    """
    economic_df = _baidu_finance_calendar(
        cate="economic_data",
        process_func=_process_economic_data,
        date=date,
        cookie=cookie,
    )
    return economic_df
|
|
|
|
|
|
def news_trade_notify_suspend_baidu(
    date: str = "20251126", cookie: str = None
) -> pd.DataFrame:
    """
    Baidu finance calendar - trade reminders - suspensions and resumptions.

    https://finance.baidu.com/calendar

    :param date: query date in ``YYYYMMDD`` form
    :param cookie: Cookie header value forwarded to the calendar request
    :return: suspension/resumption data as a pd.DataFrame
    """
    suspend_df = _baidu_finance_calendar(
        cate="notify_suspend",
        process_func=_process_suspend_data,
        date=date,
        cookie=cookie,
    )
    return suspend_df
|
|
|
|
|
|
def _process_dividend_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw dividend records into a DataFrame with a fixed schema.

    API field names are renamed to the Chinese column headers and the result
    always carries the same nine columns in the same order. Missing payout
    columns (cash, bonus shares, transfer, in-kind) are filled with "-";
    any other field the API omits becomes a NaN/NaT column instead of
    raising ``KeyError``. The two date columns are converted to
    ``datetime.date``.

    :param data_list: list of record dicts from one calendar entry
    :return: processed DataFrame; an empty DataFrame for empty input
    """
    if not data_list:
        return pd.DataFrame()

    # The API's ``market`` field is not part of the output, so it is simply
    # left unmapped and dropped by the column selection below.
    rename_dict = {
        "code": "股票代码",
        "exchange": "交易所",
        "name": "股票简称",
        "diviDate": "除权日",
        "date": "报告期",
        "diviCash": "分红",
        "shareDivide": "送股",
        "transfer": "转增",
        "physical": "实物",
    }
    ordered_cols = [
        "股票代码",
        "除权日",
        "分红",
        "送股",
        "转增",
        "实物",
        "交易所",
        "股票简称",
        "报告期",
    ]

    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)

    # Payout columns default to "-" when the API does not supply them.
    for col in ("分红", "实物", "送股", "转增"):
        if col not in temp_df.columns:
            temp_df[col] = "-"

    # reindex yields a new frame with a stable schema (absent columns become
    # NaN), so the assignments below never touch a slice of the raw frame.
    temp_df = temp_df.reindex(columns=ordered_cols)

    for col in ("除权日", "报告期"):
        temp_df[col] = pd.to_datetime(temp_df[col], errors="coerce").dt.date

    return temp_df
|
|
|
|
|
|
def news_trade_notify_dividend_baidu(
    date: str = "20251126", cookie: str = None
) -> pd.DataFrame:
    """
    Baidu finance calendar - trade reminders - dividends.

    https://finance.baidu.com/calendar

    :param date: query date in ``YYYYMMDD`` form
    :param cookie: Cookie header value forwarded to the calendar request
    :return: dividend data as a pd.DataFrame
    """
    dividend_df = _baidu_finance_calendar(
        cate="notify_divide",
        process_func=_process_dividend_data,
        date=date,
        cookie=cookie,
    )
    return dividend_df
|
|
|
|
|
|
def _process_report_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw earnings-report schedule records into a tidy DataFrame.

    API field names are renamed to the Chinese column headers. Report type
    and release time default to "-" when absent; market value falls back to
    the ``capitalization`` field when ``marketValue`` is absent. Known
    columns that are present are kept in a fixed order (others are
    dropped), market value is coerced to a number and the release date to
    ``datetime.date``.

    :param data_list: list of record dicts from one calendar entry
    :return: processed DataFrame; an empty DataFrame for empty input
    """
    if not data_list:
        return pd.DataFrame()

    rename_dict = {
        "code": "股票代码",
        "name": "股票简称",
        "exchange": "交易所",
        "reportType": "财报类型",
        "time": "发布时间",
        "marketValue": "市值",
        "capitalization": "总市值",
        "date": "发布日期",
    }
    ordered_cols = [
        "股票代码",
        "股票简称",
        "交易所",
        "财报类型",
        "发布时间",
        "市值",
        "发布日期",
    ]

    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)

    for col in ("财报类型", "发布时间"):
        if col not in temp_df.columns:
            temp_df[col] = "-"
    if "市值" not in temp_df.columns and "总市值" in temp_df.columns:
        temp_df["市值"] = temp_df["总市值"]

    # The two defaulted columns guarantee a non-empty selection, so no
    # "nothing matched" branch is needed. Copy explicitly because the
    # conversions below assign into the selected frame.
    keep = [col for col in ordered_cols if col in temp_df.columns]
    temp_df = temp_df[keep].copy()

    if "市值" in temp_df.columns:
        temp_df["市值"] = pd.to_numeric(temp_df["市值"], errors="coerce")
    if "发布日期" in temp_df.columns:
        temp_df["发布日期"] = pd.to_datetime(
            temp_df["发布日期"], errors="coerce"
        ).dt.date

    return temp_df
|
|
|
|
|
|
def news_report_time_baidu(date: str = "20251126", cookie: str = None) -> pd.DataFrame:
    """
    Baidu finance calendar - earnings report schedule.

    https://finance.baidu.com/calendar

    :param date: query date in ``YYYYMMDD`` form
    :param cookie: Cookie header value forwarded to the calendar request
    :return: report schedule as a pd.DataFrame
    """
    report_df = _baidu_finance_calendar(
        cate="report_time",
        process_func=_process_report_data,
        date=date,
        cookie=cookie,
    )
    return report_df
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: query each public entry point for the same sample
    # date, in the original order, and print the resulting frame.
    demo_date = "20251126"
    for fetch in (
        news_economic_baidu,
        news_trade_notify_suspend_baidu,
        news_trade_notify_dividend_baidu,
        news_report_time_baidu,
    ):
        print(fetch(date=demo_date))
|