#!/usr/bin/env python # -*- coding:utf-8 -*- """ Date: 2024/12/12 17:00 Desc: 生意社网站采集大宗商品现货价格及相应基差数据, 数据时间段从 20110104-至今 备注:现期差 = 现货价格 - 期货价格(这里的期货价格为结算价) 黄金为 元/克, 白银为 元/千克, 玻璃现货为 元/平方米, 鸡蛋现货为 元/公斤, 鸡蛋期货为 元/500千克, 其余为 元/吨. 焦炭现货规格是: 一级冶金焦; 焦炭期货规格: 介于一级和二级之间, 焦炭现期差仅供参考. 铁矿石现货价格是: 湿吨, 铁矿石期货价格是: 干吨 网页地址: https://www.100ppi.com/sf/ 历史数据可以通过修改 url 地址来获取, 比如: https://www.100ppi.com/sf/day-2017-09-12.html 发现生意社的 bugs: 1. 2018-09-12 周三 数据缺失是因为生意社源数据在该交易日缺失: https://www.100ppi.com/sf/day-2018-09-12.html """ import datetime import re import time import warnings from typing import List import pandas as pd from akshare.futures import cons from akshare.futures.requests_fun import pandas_read_html_link from akshare.futures.symbol_var import chinese_to_english calendar = cons.get_calendar() def futures_spot_price_daily( start_day: str = "20210201", end_day: str = "20210208", vars_list: list = cons.contract_symbols, ): """ 指定时间段内大宗商品现货价格及相应基差 https://www.100ppi.com/sf/ :param start_day: str 开始日期 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象; 默认为当天 :param end_day: str 结束数据 format:YYYY-MM-DD 或 YYYYMMDD 或 datetime.date对象; 默认为当天 :param vars_list: list 合约品种如 [RB, AL]; 默认参数为所有商品 :return: 基差 :rtype: pandas.DataFrame 展期收益率数据: var 商品品种 string sp 现货价格 float near_symbol 临近交割合约 string near_price 临近交割合约结算价 float dom_symbol 主力合约 string dom_price 主力合约结算价 float near_basis 临近交割合约相对现货的基差 float dom_basis 主力合约相对现货的基差 float near_basis_rate 临近交割合约相对现货的基差率 float dom_basis_rate 主力合约相对现货的基差率 float date 日期 string YYYYMMDD """ start_day = ( cons.convert_date(start_day) if start_day is not None else datetime.date.today() ) end_day = ( cons.convert_date(end_day) if end_day is not None else cons.convert_date(cons.get_latest_data_date(datetime.datetime.now())) ) df_list = [] while start_day <= end_day: temp_df = futures_spot_price(start_day, vars_list) if temp_df is False: return pd.concat(df_list).reset_index(drop=True) elif temp_df is not None: df_list.append(temp_df) start_day += datetime.timedelta(days=1) if len(df_list) > 0: temp_df = pd.concat(df_list) temp_df.reset_index(drop=True, inplace=True) return temp_df def futures_spot_price( date: str = "20240430", vars_list: list = cons.contract_symbols ) -> pd.DataFrame: """ 指定交易日大宗商品现货价格及相应基差 https://www.100ppi.com/sf/day-2017-09-12.html :param date: 开始日期 format: YYYY-MM-DD 或 YYYYMMDD 或 datetime.date 对象; 为空时为当天 :param vars_list: 合约品种如 RB、AL 等列表 为空时为所有商品 :return: pandas.DataFrame 展期收益率数据: var 商品品种 string sp 现货价格 float near_symbol 临近交割合约 string near_price 临近交割合约结算价 float dom_symbol 主力合约 string dom_price 主力合约结算价 float near_basis 临近交割合约相对现货的基差 float dom_basis 主力合约相对现货的基差 float near_basis_rate 临近交割合约相对现货的基差率 float dom_basis_rate 主力合约相对现货的基差率 float date 日期 string YYYYMMDD """ date = cons.convert_date(date) if date is not None else datetime.date.today() if date < datetime.date(2011, 1, 4): raise Exception( "数据源开始日期为 20110104, 请将获取数据时间点设置在 20110104 后" ) if date.strftime("%Y%m%d") not in calendar: warnings.warn(f"{date.strftime('%Y%m%d')}非交易日") return pd.DataFrame() u1 = "https://www.100ppi.com/sf/" u2 = f"https://www.100ppi.com/sf/day-{date.strftime('%Y-%m-%d')}.html" i = 1 while True: for url in [u2, u1]: try: # url = u2 headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9," "image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" } r = pandas_read_html_link(url, headers=headers) string = r[0].loc[1, 1] news = "".join(re.findall(r"[0-9]", string)) if news[3:11] == date.strftime("%Y%m%d"): records = _check_information(r[1], date) records.index = records["symbol"] var_list_in_market = [i for i in vars_list if i in records.index] temp_df = records.loc[var_list_in_market, :] temp_df.reset_index(drop=True, inplace=True) return temp_df else: time.sleep(3) except Exception as e: # noqa: E722 print( f"{date.strftime('%Y-%m-%d')}日生意社数据连接失败[错误信息:{e}],第{str(i)}次尝试,最多5次" ) i += 1 if i > 5: print( f"{date.strftime('%Y-%m-%d')}日生意社数据连接失败, 如果当前交易日是 2018-09-12, " f"由于生意社源数据缺失, 无法访问, 否则为重复访问已超过5次,您的地址被网站墙了," f"请保存好返回数据,稍后从该日期起重试" ) return pd.DataFrame() def _check_information(df_data, date): """ 数据验证和计算模块 :param df_data: pandas.DataFrame 采集的数据 :param date: datetime.date 具体某一天 YYYYMMDD :return: pandas.DataFrame 中间数据 symbol spot_price near_contract ... near_basis_rate dom_basis_rate date CU 49620.00 cu1811 ... -0.002418 -0.003426 20181108 RB 4551.54 rb1811 ... -0.013521 -0.134359 20181108 ZN 22420.00 zn1811 ... -0.032114 -0.076271 20181108 AL 13900.00 al1812 ... 0.005396 0.003957 20181108 AU 274.10 au1811 ... 0.005655 0.020430 20181108 WR 4806.25 wr1903 ... -0.180026 -0.237035 20181108 RU 10438.89 ru1811 ... -0.020969 0.084406 20181108 PB 18600.00 pb1811 ... -0.001344 -0.010215 20181108 AG 3542.67 ag1811 ... -0.000754 0.009408 20181108 BU 4045.53 bu1811 ... -0.129904 -0.149679 20181108 HC 4043.33 hc1811 ... -0.035449 -0.088128 20... """ df_data = df_data.loc[:, [0, 1, 2, 3, 5, 6]] df_data.columns = [ "symbol", "spot_price", "near_contract", "near_contract_price", "dominant_contract", "dominant_contract_price", ] records = pd.DataFrame() for string in df_data["symbol"].tolist(): news = "".join(re.findall(r"[\u4e00-\u9fa5]", string)) if news == "": news = string.strip() """ if string == "PTA": news = "PTA" else: news = "".join(re.findall(r"[\u4e00-\u9fa5]", string)) """ if news != "" and news not in [ "商品", "价格", "上海期货交易所", "郑州商品交易所", "大连商品交易所", "广州期货交易所", # 某些天网站没有数据,比如 20180912,此时返回"暂无数据",但并不是网站被墙了 "暂无数据", ]: symbol = chinese_to_english(news) record = pd.DataFrame(df_data[df_data["symbol"] == string]) record.loc[:, "symbol"] = symbol record["spot_price"] = record["spot_price"].astype(float) if ( symbol == "JD" ): # 鸡蛋现货为元/公斤, 鸡蛋期货为元/500千克, 其余元/吨(http://www.100ppi.com/sf/) record.loc[:, "spot_price"] = float(record["spot_price"].iloc[0]) * 500 elif ( symbol == "FG" ): # 上表中现货单位为元/平方米, 期货单位为元/吨. 换算公式:元/平方米*80=元/吨(http://www.100ppi.com/sf/959.html) record.loc[:, "spot_price"] = float(record["spot_price"].iloc[0]) * 80 elif ( symbol == "LH" ): # 上表中现货单位为元/公斤, 期货单位为元/吨. 换算公式:元/公斤*1000=元/吨(http://www.100ppi.com/sf/959.html) record.loc[:, "spot_price"] = float(record["spot_price"].iloc[0]) * 1000 records = pd.concat([records, record]) # 20241129:如果某日没有数据,直接返回返回空表 if records.empty: records = df_data.iloc[0:0] records["near_basis"] = pd.Series(dtype="float") records["dom_basis"] = pd.Series(dtype="float") records["near_basis_rate"] = pd.Series(dtype="float") records["dom_basis_rate"] = pd.Series(dtype="float") records["date"] = pd.Series(dtype="object") return records records[["near_contract_price", "dominant_contract_price", "spot_price"]] = ( records[["near_contract_price", "dominant_contract_price", "spot_price"] ].astype("float") ) records["near_contract"] = records["near_contract"].replace( r"[^0-9]*(\d*)$", r"\g<1>", regex=True ) records["dominant_contract"] = records["dominant_contract"].replace( r"[^0-9]*(\d*)$", r"\g<1>", regex=True ) records["near_month"] = records.loc[:, "near_contract"] records["near_contract"] = records["symbol"] + records.loc[ :, "near_contract" ].astype("int").astype("str") records["dominant_month"] = records.loc[:, "dominant_contract"] records["dominant_contract"] = records["symbol"] + records.loc[ :, "dominant_contract" ].astype("int").astype("str") records["near_contract"] = records["near_contract"].apply( lambda x: ( x.lower() if x[:-4] in cons.market_exchange_symbols["shfe"] + cons.market_exchange_symbols["dce"] else x ) ) records["dominant_contract"] = records["dominant_contract"].apply( lambda x: ( x.lower() if x[:-4] in cons.market_exchange_symbols["shfe"] + cons.market_exchange_symbols["dce"] else x ) ) records["near_contract"] = records["near_contract"].apply( lambda x: ( x[:-4] + x[-3:] if x[:-4] in cons.market_exchange_symbols["czce"] else x ) ) records["dominant_contract"] = records["dominant_contract"].apply( lambda x: ( x[:-4] + x[-3:] if x[:-4] in cons.market_exchange_symbols["czce"] else x ) ) records["near_basis"] = records["near_contract_price"] - records["spot_price"] records["dom_basis"] = records["dominant_contract_price"] - records["spot_price"] records["near_basis_rate"] = ( records["near_contract_price"] / records["spot_price"] - 1 ) records["dom_basis_rate"] = ( records["dominant_contract_price"] / records["spot_price"] - 1 ) # records.loc[:, "date"] = date.strftime("%Y%m%d") records.insert(0, "date", date.strftime("%Y%m%d")) records.reset_index(inplace=True, drop=True) return records def _join_head(content: pd.DataFrame) -> List: headers = [] for s1, s2 in zip(content.iloc[0], content.iloc[1]): if s1 != s2: s = f"{s1}{s2}" else: s = s1 headers.append(s) return headers def futures_spot_price_previous(date: str = "20240430") -> pd.DataFrame: """ 具体交易日大宗商品现货价格及相应基差 https://www.100ppi.com/sf/day-2017-09-12.html :param date: 交易日; 历史日期 :type date: str :return: 现货价格及相应基差 :rtype: pandas.DataFrame """ date = cons.convert_date(date) if date is not None else datetime.date.today() if date < datetime.date(2011, 1, 4): raise Exception( "数据源开始日期为 20110104, 请将获取数据时间点设置在 20110104 后" ) if date.strftime("%Y%m%d") not in calendar: warnings.warn(f"{date.strftime('%Y%m%d')}非交易日") return pd.DataFrame() url = date.strftime("https://www.100ppi.com/sf2/day-%Y-%m-%d.html") headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp," "image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" } content = pandas_read_html_link(url, headers=headers) main = content[1] # Header header = _join_head(main) # Values values = main[main[4].str.endswith("%")] values.columns = header # Basis # 对于没有数据的天,xml文件中没有数据,所以content[2:-1]可能为空 if len(content[2:-1]) > 0: basis = pd.concat(content[2:-1]) else: basis = pd.DataFrame(columns=["主力合约基差", "主力合约基差(%)"]) basis.columns = ["主力合约基差", "主力合约基差(%)"] # 20241125(jasonudu):因为部分日期,存在多个品种的现货价格,比如20151125的白糖、豆粕、豆油等, # 如果用商品名来merge,会出现重复列名,所以改用index来merge # basis["商品"] = values["商品"].tolist() basis.index = values.index basis = pd.merge( values[["商品", "现货价格", "主力合约代码", "主力合约价格"]], basis, left_index=True, right_index=True, ) basis = pd.merge( basis, values[ [ "180日内主力基差最高", "180日内主力基差最低", "180日内主力基差平均", ] ], left_index=True, right_index=True, ) basis.columns = [ "商品", "现货价格", "主力合约代码", "主力合约价格", "主力合约基差", "主力合约变动百分比", "180日内主力基差最高", "180日内主力基差最低", "180日内主力基差平均", ] basis["主力合约变动百分比"] = basis["主力合约变动百分比"].str.strip("%") basis.reset_index(inplace=True, drop=True) return basis if __name__ == "__main__": futures_spot_price_daily_df = futures_spot_price_daily( start_day="20260303", end_day="20260303", vars_list=['PL'] ) print(futures_spot_price_daily_df) futures_spot_price_df = futures_spot_price(date="20260303") print(futures_spot_price_df) futures_spot_price_previous_df = futures_spot_price_previous(date="20240430") print(futures_spot_price_previous_df)