Files
MoFin/venv/lib/python3.12/site-packages/akshare/news/news_baidu.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

463 lines
14 KiB
Python

# -*- coding:utf-8 -*-
# !/usr/bin/env python
"""
Date: 2026/4/12 17:00
Desc: 百度股市通-经济数据
https://finance.baidu.com/calendar
"""
import math
import re
import pandas as pd
from curl_cffi import requests
def _get_baidu_cookie(headers: dict) -> str:
    """
    Collect the cookies needed by the Baidu Gushitong calendar endpoint.

    Two requests are made on one session: the calendar page (expected to set
    BAIDUID / BAIDUID_BFESS) and the hm.js script referenced by that page
    (expected to set HMACCOUNT / HMACCOUNT_BFESS).

    :param headers: base request headers installed on the session
    :return: cookie header value with the four cookies joined by "; "
    :raises ValueError: when a required cookie is missing or the hm.js URL
        cannot be found in the calendar page
    :raises ConnectionError: when a request fails at the network level
    """
    calendar_url = "https://finance.baidu.com/calendar"
    try:
        # One session for both calls so the second request carries the
        # cookies set by the first.
        with requests.Session() as sess:
            sess.headers.update(headers)

            # Step 1: calendar page -> BAIDUID cookies.
            page = sess.get(calendar_url, impersonate="chrome110", timeout=10)
            page.raise_for_status()
            baiduid = page.cookies.get("BAIDUID")
            baiduid_bfess = page.cookies.get("BAIDUID_BFESS")
            if not (baiduid and baiduid_bfess):
                raise ValueError("Missing BAIDUID cookies in first response")

            # Step 2: locate the hm.js URL; try the absolute form first, then
            # the protocol-relative form.
            html = page.text
            found = re.search(r"https://hm\.baidu\.com/hm\.js\?\w+", html)
            if found is None:
                found = re.search(r"//hm\.baidu\.com/hm\.js\?\w+", html)
            if found is None:
                raise ValueError("Failed to extract hm.js URL from response")
            hm_url = found.group()
            if hm_url.startswith("//"):
                hm_url = "https:" + hm_url

            # Step 3: hm.js -> HMACCOUNT cookies.
            script = sess.get(hm_url, impersonate="chrome110", timeout=10)
            script.raise_for_status()
            hmac_count = script.cookies.get("HMACCOUNT")
            hmac_count_bfess = script.cookies.get("HMACCOUNT_BFESS")
            if not (hmac_count and hmac_count_bfess):
                raise ValueError("Missing HMACCOUNT cookies in second response")

            cookie_pairs = (
                ("BAIDUID", baiduid),
                ("BAIDUID_BFESS", baiduid_bfess),
                ("HMACCOUNT", hmac_count),
                ("HMACCOUNT_BFESS", hmac_count_bfess),
            )
            return "; ".join(f"{key}={value}" for key, value in cookie_pairs)
    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Network request failed: {str(e)}") from e
    except re.error as e:
        raise ValueError(f"Regex pattern error: {str(e)}") from e
def _baidu_finance_calendar(
    date: str, cate: str, process_func, cookie: str = None
) -> pd.DataFrame:
    """
    Shared fetcher for the Baidu Gushitong finance calendar (with paging).

    :param date: query date, formatted as YYYYMMDD
    :param cate: value of the ``cate`` query parameter; this module uses
        "economic_data", "notify_suspend", "notify_divide" and "report_time"
    :param process_func: callable that converts one calendar entry's raw
        ``list`` payload into a DataFrame
    :param cookie: ready-made cookie header value; when None it is obtained
        through _get_baidu_cookie
    :return: all processed pages concatenated into one DataFrame, or an empty
        DataFrame when no entry matches the date
    :raises RuntimeError: when the cookie cannot be obtained
    HTTP errors raised by ``raise_for_status()`` propagate unchanged.
    """
    page_size = 100
    # YYYYMMDD -> YYYY-MM-DD, the format compared against each entry's "date".
    formatted_date = "-".join([date[:4], date[4:6], date[6:]])
    base_params = {
        "start_date": formatted_date,
        "end_date": formatted_date,
        "pn": "0",
        "rn": str(page_size),
        "cate": cate,
        "finClientType": "pc",
    }
    headers = {
        "accept": "application/vnd.finance-web.v1+json",
        "accept-encoding": "gzip, deflate, br, zstd",
        "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
        "cache-control": "no-cache",
        "origin": "https://finance.baidu.com",
        "pragma": "no-cache",
        "priority": "u=1, i",
        "referer": "https://finance.baidu.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/142.0.0.0 Safari/537.36",
    }
    if cookie is None:
        try:
            # Pass a copy so the helper cannot mutate the headers used below.
            cookie = _get_baidu_cookie(headers.copy())
        except Exception as e:
            raise RuntimeError(f"Failed to obtain Baidu cookies: {str(e)}") from e
    headers["cookie"] = cookie
    url = "https://finance.pae.baidu.com/sapi/v1/financecalendar"

    def _fetch_page(page: int) -> dict:
        """Request one page; every page uses the same browser impersonation."""
        # NOTE(review): "pn" is sent as a zero-based page index, as before;
        # confirm the endpoint does not expect a record offset instead.
        params = {**base_params, "pn": str(page)}
        response = requests.get(
            url=url, params=params, headers=headers, impersonate="chrome110"
        )
        response.raise_for_status()
        return response.json()

    def _entries_for_date(payload) -> list:
        """Return the calendarInfo entries whose date equals the queried date."""
        if "Result" in payload and "calendarInfo" in payload["Result"]:
            return [
                entry
                for entry in payload["Result"]["calendarInfo"]
                if entry.get("date") == formatted_date
            ]
        return []

    first_payload = _fetch_page(0)
    # The first matching entry carries the record count used for paging.
    total_records = next(
        (entry.get("total", 0) for entry in _entries_for_date(first_payload)), 0
    )
    total_pages = math.ceil(total_records / page_size) if total_records > 0 else 1

    # Collect per-entry frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    frames = []
    for page in range(total_pages):
        payload = first_payload if page == 0 else _fetch_page(page)
        for entry in _entries_for_date(payload):
            if entry.get("list"):
                frames.append(process_func(entry["list"]))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def _process_economic_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw economic-calendar records into a DataFrame.

    :param data_list: raw records (dicts) from one calendar entry
    :return: DataFrame with Chinese column names; the value columns
        (公布/预期/前值/重要性) are always present and numeric, the remaining
        output columns appear only when the source supplied them. Empty
        DataFrame for empty input.
    """
    if not data_list:
        return pd.DataFrame()
    numeric_cols = ["公布", "预期", "前值", "重要性"]
    output_cols = [
        "日期",
        "时间",
        "国家",
        "地区",
        "事件",
        "统计周期",
        "公布",
        "预期",
        "前值",
        "重要性",
    ]
    rename_dict = {
        "date": "日期",
        "time": "时间",
        "title": "事件",
        "star": "重要性",
        "formerVal": "前值",
        "pubVal": "公布",
        "region": "地区",
        "indicateVal": "预期",
        "country": "国家",
        "timePeriod": "统计周期",
    }
    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)
    # Guarantee the value columns exist even if the API omitted them.
    for col in numeric_cols:
        if col not in temp_df.columns:
            temp_df[col] = None
    available_cols = [col for col in output_cols if col in temp_df.columns]
    # .copy(): the assignments below must target an independent frame, not a
    # column subset of the original (avoids SettingWithCopyWarning).
    temp_df = temp_df[available_cols].copy()
    for col in numeric_cols:
        # Non-numeric placeholders become NaN.
        temp_df[col] = pd.to_numeric(temp_df[col], errors="coerce")
    if "日期" in temp_df.columns:
        temp_df["日期"] = pd.to_datetime(temp_df["日期"], errors="coerce").dt.date
    return temp_df
def _process_suspend_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw trading-suspension records into a DataFrame.

    :param data_list: raw records (dicts) from one calendar entry
    :return: DataFrame with a fixed set of Chinese-named columns in a fixed
        order. A missing resumption time ("end") is filled with "-"; any
        other field absent from the source becomes a NaN column instead of
        raising KeyError. Empty DataFrame for empty input.
    """
    if not data_list:
        return pd.DataFrame()
    output_cols = [
        "股票代码",
        "股票简称",
        "交易所代码",
        "停牌时间",
        "复牌时间",
        "停牌事项说明",
        "市值",
        "公告日期",
        "公告时间",
        "证券类型",
        "市场类型",
        "是否跳过",
    ]
    rename_dict = {
        "code": "股票代码",
        "name": "股票简称",
        "exchange": "交易所代码",
        "start": "停牌时间",
        "reason": "停牌事项说明",
        "marketValue": "市值",
        "date": "公告日期",
        "time": "公告时间",
        "type": "证券类型",
        "market": "市场类型",
        "isSkip": "是否跳过",
        "end": "复牌时间",
    }
    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)
    if "复牌时间" not in temp_df.columns:
        temp_df["复牌时间"] = "-"
    # reindex keeps the output schema stable: it orders the columns, drops
    # extras and creates NaN columns for fields the API did not return.
    return temp_df.reindex(columns=output_cols)
def news_economic_baidu(date: str = "20251126", cookie: str = None) -> pd.DataFrame:
    """
    Baidu Gushitong - economic data
    https://finance.baidu.com/calendar
    :param date: query date, formatted as YYYYMMDD
    :param cookie: cookie string forwarded to the calendar fetcher
    :return: economic data as a pd.DataFrame
    """
    request_kwargs = {
        "date": date,
        "cate": "economic_data",
        "process_func": _process_economic_data,
        "cookie": cookie,
    }
    return _baidu_finance_calendar(**request_kwargs)
def news_trade_notify_suspend_baidu(
    date: str = "20251126", cookie: str = None
) -> pd.DataFrame:
    """
    Baidu Gushitong - trade reminders - suspensions and resumptions
    https://finance.baidu.com/calendar
    :param date: query date, formatted as YYYYMMDD
    :param cookie: cookie string forwarded to the calendar fetcher
    :return: suspension/resumption data as a pd.DataFrame
    """
    request_kwargs = {
        "date": date,
        "cate": "notify_suspend",
        "process_func": _process_suspend_data,
        "cookie": cookie,
    }
    return _baidu_finance_calendar(**request_kwargs)
def _process_dividend_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw dividend / distribution records into a DataFrame.

    :param data_list: raw records (dicts) from one calendar entry
    :return: DataFrame with a fixed set of Chinese-named columns in a fixed
        order. The four payout columns (分红/送股/转增/实物) default to "-"
        when absent; any other absent field becomes a NaN column instead of
        raising KeyError. The two date columns hold ``datetime.date`` values
        (NaT where unparseable). Empty DataFrame for empty input.
    """
    if not data_list:
        return pd.DataFrame()
    output_cols = [
        "股票代码",
        "除权日",
        "分红",
        "送股",
        "转增",
        "实物",
        "交易所",
        "股票简称",
        "报告期",
    ]
    # "market" is intentionally not mapped: it is not part of the output and
    # is dropped by the column selection below.
    rename_dict = {
        "code": "股票代码",
        "exchange": "交易所",
        "name": "股票简称",
        "diviDate": "除权日",
        "date": "报告期",
        "diviCash": "分红",
        "shareDivide": "送股",
        "transfer": "转增",
        "physical": "实物",
    }
    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)
    for col in ("分红", "实物", "送股", "转增"):
        if col not in temp_df.columns:
            temp_df[col] = "-"
    # reindex returns a new frame with exactly the output schema, so the
    # assignments below never write into a subset of the original frame.
    temp_df = temp_df.reindex(columns=output_cols)
    for col in ("除权日", "报告期"):
        temp_df[col] = pd.to_datetime(temp_df[col], errors="coerce").dt.date
    return temp_df
def news_trade_notify_dividend_baidu(
    date: str = "20251126", cookie: str = None
) -> pd.DataFrame:
    """
    Baidu Gushitong - trade reminders - dividends and distributions
    https://finance.baidu.com/calendar
    :param date: query date, formatted as YYYYMMDD
    :param cookie: cookie string forwarded to the calendar fetcher
    :return: dividend/distribution data as a pd.DataFrame
    """
    request_kwargs = {
        "date": date,
        "cate": "notify_divide",
        "process_func": _process_dividend_data,
        "cookie": cookie,
    }
    return _baidu_finance_calendar(**request_kwargs)
def _process_report_data(data_list: list) -> pd.DataFrame:
    """
    Convert raw earnings-release records into a DataFrame.

    :param data_list: raw records (dicts) from one calendar entry
    :return: DataFrame with Chinese column names. Report type and release
        time are always present (default "-"); market value falls back to
        the total capitalization field when only that one is supplied and is
        converted to numeric; the release date holds ``datetime.date``
        values. Other output columns appear only when the source supplied
        them. Empty DataFrame for empty input.
    """
    if not data_list:
        return pd.DataFrame()
    output_cols = [
        "股票代码",
        "股票简称",
        "交易所",
        "财报类型",
        "发布时间",
        "市值",
        "发布日期",
    ]
    rename_dict = {
        "code": "股票代码",
        "name": "股票简称",
        "exchange": "交易所",
        "reportType": "财报类型",
        "time": "发布时间",
        "marketValue": "市值",
        "capitalization": "总市值",
        "date": "发布日期",
    }
    temp_df = pd.DataFrame(data_list).rename(columns=rename_dict)
    for col in ("财报类型", "发布时间"):
        if col not in temp_df.columns:
            temp_df[col] = "-"
    if "市值" not in temp_df.columns and "总市值" in temp_df.columns:
        temp_df["市值"] = temp_df["总市值"]
    # The two defaulted columns always exist, so this list is never empty.
    available_cols = [col for col in output_cols if col in temp_df.columns]
    # .copy(): the conversions below must write to an independent frame, not
    # a column subset of the original (avoids SettingWithCopyWarning).
    temp_df = temp_df[available_cols].copy()
    if "市值" in temp_df.columns:
        temp_df["市值"] = pd.to_numeric(temp_df["市值"], errors="coerce")
    if "发布日期" in temp_df.columns:
        temp_df["发布日期"] = pd.to_datetime(
            temp_df["发布日期"], errors="coerce"
        ).dt.date
    return temp_df
def news_report_time_baidu(date: str = "20251126", cookie: str = None) -> pd.DataFrame:
    """
    Baidu Gushitong - earnings release schedule
    https://finance.baidu.com/calendar
    :param date: query date, formatted as YYYYMMDD
    :param cookie: cookie string forwarded to the calendar fetcher
    :return: earnings release data as a pd.DataFrame
    """
    request_kwargs = {
        "date": date,
        "cate": "report_time",
        "process_func": _process_report_data,
        "cookie": cookie,
    }
    return _baidu_finance_calendar(**request_kwargs)
if __name__ == "__main__":
    # Demo run: call each public fetcher for one sample date and print the
    # resulting DataFrame, in the same order as the definitions above.
    for fetcher in (
        news_economic_baidu,
        news_trade_notify_suspend_baidu,
        news_trade_notify_dividend_baidu,
        news_report_time_baidu,
    ):
        print(fetcher(date="20251126"))