fa45d8aa5f
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
Privoxy对node122:18003返回500,直连正常
387 lines
16 KiB
Python
387 lines
16 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding:utf-8 -*-
|
||
"""
|
||
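Date: 2024/12/12 17:00
Desc: Collects commodity spot prices and the corresponding basis data from the 100ppi (生意社) website; data is available from 20110104 to the present.
Note: the site's spot-futures spread (现期差) = spot price - futures price, where the futures price is the settlement price.
The *_basis columns computed in this module use the opposite sign: futures price - spot price.
Gold is quoted in yuan/gram, silver in yuan/kilogram, glass spot in yuan/square metre, egg spot in yuan/kilogram, egg futures in yuan/500 kilograms; everything else is in yuan/ton.
Coke spot grade: first-grade metallurgical coke; coke futures grade: between first and second grade, so the coke spread is for reference only.
Iron ore spot prices are per wet ton; iron ore futures prices are per dry ton.
Web page: https://www.100ppi.com/sf/
Historical data can be fetched by changing the URL, e.g. https://www.100ppi.com/sf/day-2017-09-12.html
Known problems in the 100ppi data:
1. Data for Wednesday 2018-09-12 is missing because the 100ppi source data is missing for that trading day: https://www.100ppi.com/sf/day-2018-09-12.html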
|
||
"""
|
||
|
||
import datetime
|
||
import re
|
||
import time
|
||
import warnings
|
||
from typing import List
|
||
|
||
import pandas as pd
|
||
|
||
from akshare.futures import cons
|
||
from akshare.futures.requests_fun import pandas_read_html_link
|
||
from akshare.futures.symbol_var import chinese_to_english
|
||
|
||
# Calendar loaded once at import time; the functions in this module treat a
# YYYYMMDD string that is not contained in it as a non-trading day.
calendar = cons.get_calendar()
|
||
|
||
|
||
def futures_spot_price_daily(
    start_day: str = "20210201",
    end_day: str = "20210208",
    vars_list: list = cons.contract_symbols,
):
    """
    Collect commodity spot prices and basis data for every day in a range.

    Data source: https://www.100ppi.com/sf/

    Each calendar day from ``start_day`` to ``end_day`` (both inclusive) is
    passed to ``futures_spot_price`` and the resulting frames are stacked.

    :param start_day: first day of the range, converted with
        ``cons.convert_date`` (the original notes list YYYY-MM-DD, YYYYMMDD
        or a datetime.date object as accepted forms); ``None`` means today
    :param end_day: last day of the range, same forms; ``None`` means the
        date given by ``cons.get_latest_data_date`` for the current time
    :param vars_list: commodity symbols to keep, e.g. ["RB", "AL"];
        defaults to ``cons.contract_symbols``
    :return: the daily frames concatenated with a fresh 0..n-1 index, or
        ``None`` when no frame was collected (e.g. ``start_day`` is later
        than ``end_day``)
    :rtype: pandas.DataFrame

    NOTE(review): the previous docstring listed output columns named
    var / sp / near_symbol / near_price / dom_symbol / dom_price /
    near_basis / dom_basis / near_basis_rate / dom_basis_rate / date, but
    ``_check_information`` in this file names them symbol / spot_price /
    near_contract / ... -- confirm which naming callers rely on.
    """
    start_day = (
        cons.convert_date(start_day) if start_day is not None else datetime.date.today()
    )
    end_day = (
        cons.convert_date(end_day)
        if end_day is not None
        else cons.convert_date(cons.get_latest_data_date(datetime.datetime.now()))
    )
    one_day = datetime.timedelta(days=1)
    df_list = []
    while start_day <= end_day:
        temp_df = futures_spot_price(start_day, vars_list)
        if temp_df is False:
            # A ``False`` sentinel means "stop now and hand back what has been
            # gathered". Falling through to the shared exit below avoids
            # calling pd.concat on an empty list, which raises ValueError.
            break
        if temp_df is not None:
            df_list.append(temp_df)
        start_day += one_day
    if not df_list:
        # Same result the function has always produced when nothing was
        # collected; made explicit instead of an implicit fall-through.
        return None
    return pd.concat(df_list).reset_index(drop=True)
|
||
|
||
|
||
def futures_spot_price(
    date: str = "20240430", vars_list: list = cons.contract_symbols
) -> pd.DataFrame:
    """
    Fetch commodity spot prices and basis data for one trading day.

    Example page: https://www.100ppi.com/sf/day-2017-09-12.html

    The dated page is tried first, then the landing page. A page is accepted
    only when the digits at positions 3..10 of the text found in cell (1, 1)
    of its first table equal the requested date; its second table is then
    handed to ``_check_information``.

    :param date: day to fetch, converted with ``cons.convert_date`` (the
        original notes list YYYY-MM-DD, YYYYMMDD or a datetime.date object);
        ``None`` means today
    :param vars_list: commodity symbols to keep, e.g. ["RB", "AL"]; defaults
        to ``cons.contract_symbols``. Rows are returned in this order and
        symbols missing from the page are skipped.
    :return: the rows produced by ``_check_information`` restricted to
        ``vars_list``; an empty DataFrame when the date is not in the
        calendar, when 5 attempts have failed with an exception, or when
        the pages keep being dated differently from the requested day
    :rtype: pandas.DataFrame
    :raises Exception: when the date is earlier than 2011-01-04
    """
    date = cons.convert_date(date) if date is not None else datetime.date.today()
    if date < datetime.date(2011, 1, 4):
        raise Exception(
            "数据源开始日期为 20110104, 请将获取数据时间点设置在 20110104 后"
        )
    day_compact = date.strftime("%Y%m%d")
    if day_compact not in calendar:
        warnings.warn(f"{day_compact}非交易日")
        return pd.DataFrame()

    day_dashed = date.strftime("%Y-%m-%d")
    u1 = "https://www.100ppi.com/sf/"
    u2 = f"https://www.100ppi.com/sf/day-{day_dashed}.html"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
    }
    max_tries = 5
    attempt = 1  # 1-based number of the failing attempt shown in the message
    stale_pages = 0  # pages that loaded fine but were dated differently
    while True:
        for url in [u2, u1]:
            try:
                r = pandas_read_html_link(url, headers=headers)
                string = r[0].loc[1, 1]
                news = "".join(re.findall(r"[0-9]", string))
                if news[3:11] == day_compact:
                    records = _check_information(r[1], date)
                    records.index = records["symbol"]
                    var_list_in_market = [
                        var for var in vars_list if var in records.index
                    ]
                    temp_df = records.loc[var_list_in_market, :]
                    temp_df.reset_index(drop=True, inplace=True)
                    return temp_df
            except Exception as e:
                print(
                    f"{day_dashed}日生意社数据连接失败[错误信息:{e}],第{attempt}次尝试,最多5次"
                )
                attempt += 1
                if attempt > max_tries:
                    print(
                        f"{day_dashed}日生意社数据连接失败, 如果当前交易日是 2018-09-12, "
                        "由于生意社源数据缺失, 无法访问, 否则为重复访问已超过5次,您的地址被网站墙了,"
                        "请保存好返回数据,稍后从该日期起重试"
                    )
                    return pd.DataFrame()
                continue
            # The page loaded but belongs to another day. Previously only
            # exceptions were counted, so two pages that both load with the
            # wrong date kept this loop running forever. The separate bound
            # is one higher than the exception bound so that any run which
            # also hits exceptions still ends through the branch above.
            stale_pages += 1
            if stale_pages > max_tries:
                warnings.warn(
                    f"{day_dashed}: 100ppi served no page dated {day_dashed} "
                    "after repeated retries; returning an empty DataFrame"
                )
                return pd.DataFrame()
            time.sleep(3)
|
||
|
||
|
||
def _check_information(df_data, date):
    """
    Validate the scraped table and derive contract codes and basis columns.

    :param df_data: pandas.DataFrame scraped table; its columns are addressed
        by the integer labels 0, 1, 2, 3, 5 and 6 (symbol, spot price, near
        contract, near contract price, dominant contract, dominant contract
        price). The caller's frame is not modified.
    :param date: datetime.date of the data; only ``strftime`` is called on it
    :return: pandas.DataFrame. When at least one commodity row is found the
        columns are date, symbol, spot_price, near_contract,
        near_contract_price, dominant_contract, dominant_contract_price,
        near_month, dominant_month, near_basis, dom_basis, near_basis_rate,
        dom_basis_rate. When none is found an empty frame is returned with
        the six input columns followed by near_basis, dom_basis,
        near_basis_rate, dom_basis_rate and date.

    Sample of the intermediate data (from the original notes):
        symbol  spot_price near_contract  ...  near_basis_rate dom_basis_rate      date
            CU    49620.00        cu1811  ...        -0.002418      -0.003426  20181108
            RB     4551.54        rb1811  ...        -0.013521      -0.134359  20181108
            ZN    22420.00        zn1811  ...        -0.032114      -0.076271  20181108
    """
    # Work on an independent copy so the column assignments below never write
    # through to (or warn about) a slice of the caller's frame.
    df_data = df_data.loc[:, [0, 1, 2, 3, 5, 6]].copy()
    df_data.columns = [
        "symbol",
        "spot_price",
        "near_contract",
        "near_contract_price",
        "dominant_contract",
        "dominant_contract_price",
    ]

    # Header cells and exchange-name separator rows that are not commodities.
    non_commodity_labels = [
        "商品",
        "价格",
        "上海期货交易所",
        "郑州商品交易所",
        "大连商品交易所",
        "广州期货交易所",
        # On some days (e.g. 20180912) the site has no data and shows this
        # placeholder instead; that does not mean the site blocked us.
        "暂无数据",
    ]
    # Spot prices quoted in a different unit from the futures price:
    #   JD: spot yuan/kg, futures yuan/500 kg
    #   FG: spot yuan/square metre, futures yuan/ton (x80, per
    #       http://www.100ppi.com/sf/959.html)
    #   LH: spot yuan/kg, futures yuan/ton
    spot_unit_factor = {"JD": 500, "FG": 80, "LH": 1000}

    frames = []
    for string in df_data["symbol"].tolist():
        # Prefer the Chinese characters of the cell; fall back to the stripped
        # raw text for names without any (e.g. "PTA").
        news = "".join(re.findall(r"[\u4e00-\u9fa5]", string))
        if news == "":
            news = string.strip()
        if news == "" or news in non_commodity_labels:
            continue

        symbol = chinese_to_english(news)
        record = df_data[df_data["symbol"] == string].copy()
        record.loc[:, "symbol"] = symbol
        record["spot_price"] = record["spot_price"].astype(float)
        if symbol in spot_unit_factor:
            record.loc[:, "spot_price"] = (
                float(record["spot_price"].iloc[0]) * spot_unit_factor[symbol]
            )
        frames.append(record)

    # One concat at the end instead of growing a frame inside the loop.
    records = pd.concat(frames) if frames else pd.DataFrame()

    # 20241129: when a day has no data, return an empty table directly.
    if records.empty:
        records = df_data.iloc[0:0].copy()
        for column in ("near_basis", "dom_basis", "near_basis_rate", "dom_basis_rate"):
            records[column] = pd.Series(dtype="float")
        records["date"] = pd.Series(dtype="object")
        return records

    price_columns = ["near_contract_price", "dominant_contract_price", "spot_price"]
    records[price_columns] = records[price_columns].astype("float")

    # Built once rather than for every row inside ``apply``.
    lower_case_symbols = (
        cons.market_exchange_symbols["shfe"] + cons.market_exchange_symbols["dce"]
    )
    czce_symbols = cons.market_exchange_symbols["czce"]

    def _exchange_style(code):
        # SHFE/DCE codes are lower-cased; CZCE codes drop the first of their
        # last four characters. The prefix is everything before those four.
        if code[:-4] in lower_case_symbols:
            code = code.lower()
        if code[:-4] in czce_symbols:
            code = code[:-4] + code[-3:]
        return code

    for contract_column, month_column in (
        ("near_contract", "near_month"),
        ("dominant_contract", "dominant_month"),
    ):
        # Keep only the trailing digits of the contract name.
        records[contract_column] = records[contract_column].replace(
            r"[^0-9]*(\d*)$", r"\g<1>", regex=True
        )
        records[month_column] = records.loc[:, contract_column]
        records[contract_column] = records["symbol"] + records.loc[
            :, contract_column
        ].astype("int").astype("str")
        records[contract_column] = records[contract_column].apply(_exchange_style)

    # Basis here is futures price minus spot price.
    records["near_basis"] = records["near_contract_price"] - records["spot_price"]
    records["dom_basis"] = records["dominant_contract_price"] - records["spot_price"]
    records["near_basis_rate"] = (
        records["near_contract_price"] / records["spot_price"] - 1
    )
    records["dom_basis_rate"] = (
        records["dominant_contract_price"] / records["spot_price"] - 1
    )
    records.insert(0, "date", date.strftime("%Y%m%d"))
    records.reset_index(inplace=True, drop=True)
    return records
|
||
|
||
|
||
def _join_head(content: pd.DataFrame) -> List:
    """
    Build one header label per column from the first two rows of ``content``.

    When the two rows hold different values for a column, the values are
    concatenated (first row, then second row); otherwise the first row's
    value is used unchanged.

    :param content: table whose rows 0 and 1 hold the two header lines
    :return: list with one label per column
    """
    first_row = content.iloc[0]
    second_row = content.iloc[1]
    return [
        f"{upper}{lower}" if upper != lower else upper
        for upper, lower in zip(first_row, second_row)
    ]
|
||
|
||
|
||
def futures_spot_price_previous(date: str = "20240430") -> pd.DataFrame:
    """
    Fetch spot prices and dominant-contract basis data for one past trading day.

    Example page: https://www.100ppi.com/sf/day-2017-09-12.html
    (the request itself goes to the ``/sf2/`` variant of that address).

    :param date: trading day, a historical date; converted with
        ``cons.convert_date``; ``None`` means today
    :type date: str
    :return: spot price and basis table with the columns 商品, 现货价格,
        主力合约代码, 主力合约价格, 主力合约基差, 主力合约变动百分比,
        180日内主力基差最高, 180日内主力基差最低, 180日内主力基差平均;
        an empty DataFrame when the date is not in the calendar
    :rtype: pandas.DataFrame
    :raises Exception: when the date is earlier than 2011-01-04
    """
    date = cons.convert_date(date) if date is not None else datetime.date.today()
    if date < datetime.date(2011, 1, 4):
        raise Exception(
            "数据源开始日期为 20110104, 请将获取数据时间点设置在 20110104 后"
        )
    day_compact = date.strftime("%Y%m%d")
    if day_compact not in calendar:
        warnings.warn(f"{day_compact}非交易日")
        return pd.DataFrame()

    url = date.strftime("https://www.100ppi.com/sf2/day-%Y-%m-%d.html")
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,"
        "image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
    }
    content = pandas_read_html_link(url, headers=headers)
    main = content[1]

    # Header: the first two rows of the main table merged into one label each.
    header = _join_head(main)

    # Values: keep the rows whose fifth cell ends with "%". ``na=False`` makes
    # a missing cell count as "no match"; without it the mask contains NaN and
    # pandas refuses to index with it. The copy lets the frame be relabelled
    # without touching ``main``.
    values = main[main[4].str.endswith("%", na=False)].copy()
    values.columns = header

    # Basis: every table between the main one and the last one. On days
    # without data the page carries none, so this slice may be empty.
    basis_tables = content[2:-1]
    if len(basis_tables) > 0:
        basis = pd.concat(basis_tables)
    else:
        basis = pd.DataFrame(columns=["主力合约基差", "主力合约基差(%)"])
    basis.columns = ["主力合约基差", "主力合约基差(%)"]

    # 20241125 (jasonudu): on some dates a commodity has several spot prices
    # (e.g. sugar, soybean meal and soybean oil on 20151125), so merging on
    # the commodity name would produce duplicates; align on the index instead.
    basis.index = values.index
    basis = pd.merge(
        values[["商品", "现货价格", "主力合约代码", "主力合约价格"]],
        basis,
        left_index=True,
        right_index=True,
    )
    basis = pd.merge(
        basis,
        values[
            [
                "180日内主力基差最高",
                "180日内主力基差最低",
                "180日内主力基差平均",
            ]
        ],
        left_index=True,
        right_index=True,
    )
    basis.columns = [
        "商品",
        "现货价格",
        "主力合约代码",
        "主力合约价格",
        "主力合约基差",
        "主力合约变动百分比",
        "180日内主力基差最高",
        "180日内主力基差最低",
        "180日内主力基差平均",
    ]
    basis["主力合约变动百分比"] = basis["主力合约变动百分比"].str.strip("%")
    basis.reset_index(inplace=True, drop=True)
    return basis
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: call each public function once and print the result.

    # One-day range restricted to a single symbol.
    futures_spot_price_daily_df = futures_spot_price_daily(
        start_day="20260303",
        end_day="20260303",
        vars_list=["PL"],
    )
    print(futures_spot_price_daily_df)

    # Single day, default symbol list.
    futures_spot_price_df = futures_spot_price(date="20260303")
    print(futures_spot_price_df)

    # Historical page with the 180-day basis statistics.
    futures_spot_price_previous_df = futures_spot_price_previous(date="20240430")
    print(futures_spot_price_previous_df)
|