Files
MoFin/venv/lib/python3.12/site-packages/yfinance/scrapers/history.py
T
知微 fa45d8aa5f fix: 小果地址统一node122(兼容LAN+EasyTier)
- health_checklist.json: 192.168.1.122→node122
- ocr_client.py: docstring IP→node122
- docs/market-data-requirements.md: IP→node122
- 所有API调用通过ProxyHandler({})绕过系统代理
  Privoxy对node122:18003返回500,直连正常
2026-06-30 02:56:35 +08:00

3406 lines
164 KiB
Python

from yfinance._http import new_session
from math import isclose
import bisect
import datetime as _datetime
import dateutil as _dateutil
import logging
import numpy as np
import pandas as pd
import time as _time
import warnings
from yfinance import utils
from yfinance.config import YfConfig
from yfinance.const import _BASE_URL_, _PRICE_COLNAMES_, period_default, _SENTINEL_
from yfinance.exceptions import YFDataException, YFInvalidPeriodError, YFPricesMissingError, YFRateLimitError, YFTzMissingError
class PriceHistory:
def __init__(self, data, ticker, tz, session=None):
self._data = data
self.ticker = ticker.upper()
self.tz = tz
self.session = session or new_session()
self._history_cache = {}
self._history_metadata = None
self._history_metadata_formatted = False
self._dividends = None
self._splits = None
self._capital_gains = None
# Limit recursion depth when repairing prices
self._reconstruct_start_interval = None
self._last_error = None
@utils.log_indent_decorator
def history(self, period=period_default, interval="1d",
start=None, end=None, prepost=False, actions=True,
auto_adjust=True, back_adjust=False, repair=False, keepna=False,
rounding=False, timeout=10,
raise_errors=False) -> pd.DataFrame:
"""
:Parameters:
period : str
| Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
| Default: '1mo' if start & end None
| Can combine with start/end e.g. end = start + period
interval : str
| Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
| Intraday data cannot extend last 60 days
start : str
| Download start date string (YYYY-MM-DD) or _datetime, inclusive.
| Default: 99 years ago
| E.g. for start="2020-01-01", first data point = "2020-01-01"
end : str
| Download end date string (YYYY-MM-DD) or _datetime, exclusive.
| Default: now
| E.g. for end="2023-01-01", last data point = "2022-12-31"
prepost : bool
| Include Pre and Post market data in results?
| Default: False
auto_adjust : bool
| Adjust all OHLC automatically?
| Default: True
back_adjust : bool
| Back-adjusted data to mimic true historical prices
repair : bool
| Fixes price errors in Yahoo data: 100x, missing, bad dividend adjust.
| Default: False
| Full details at: :doc:`../advanced/price_repair`.
keepna : bool
| Keep NaN rows returned by Yahoo?
| Default: False
rounding : bool
| Optional: Round values to 2 decimal places?
| Default: False = use precision suggested by Yahoo!
timeout : None or float
| Optional: timeout fetches after N seconds
| Default: 10 seconds
raise_errors : bool
If True, then raise errors as Exceptions instead of logging.
"""
logger = utils.get_yf_logger()
if raise_errors:
warnings.warn("'raise_errors' deprecated, do: yf.config.debug.hide_exceptions = False", DeprecationWarning, stacklevel=5)
interval_user = interval
if period == period_default:
period_user = None
if start or end:
period = None
else:
period = '1mo'
else:
period_user = period
if repair and interval in ["5d", "1wk", "1mo", "3mo"]:
# Yahoo's way of adjusting mutiday intervals is fundamentally broken.
# Have to fetch 1d, adjust, then resample.
if interval == '5d':
raise ValueError("Yahoo's interval '5d' is nonsense, not supported with repair")
if start is None and end is None and period is not None:
# Convert period to start -> end
tz = self.tz
if tz is None:
# Every valid ticker has a timezone. A missing timezone is a problem.
_exception = YFTzMissingError(self.ticker)
err_msg = str(_exception)
self._last_error = err_msg.split(': ', 1)[1]
if raise_errors or (not YfConfig.debug.hide_exceptions):
raise _exception
else:
logger.error(err_msg)
return utils.empty_df()
if period == 'ytd':
start = _datetime.date(pd.Timestamp.now('UTC').tz_convert(tz).year, 1, 1)
else:
start = pd.Timestamp.now('UTC').tz_convert(tz).date()
start -= utils._interval_to_timedelta(period)
start -= _datetime.timedelta(days=4)
period_user = period
period = None
interval = '1d'
start_user = start
end_user = end
if start or end or (period and period.lower() == "max"):
# Check can get TZ. Fail => probably delisted
tz = self.tz
if tz is None:
# Every valid ticker has a timezone. A missing timezone is a problem.
_exception = YFTzMissingError(self.ticker)
err_msg = str(_exception)
self._last_error = err_msg.split(': ', 1)[1]
if raise_errors or (not YfConfig.debug.hide_exceptions):
raise _exception
else:
logger.error(err_msg)
return utils.empty_df()
if start:
start_dt = utils._parse_user_dt(start, tz)
start = int(start_dt.timestamp())
if end:
end_dt = utils._parse_user_dt(end, tz)
end = int(end_dt.timestamp())
if period is None:
if not (start or end):
period = '1mo' # default
elif not start:
start_dt = end_dt - utils._interval_to_timedelta('1mo')
start = int(start_dt.timestamp())
elif not end:
end_dt = pd.Timestamp.now('UTC').tz_convert(tz)
end = int(end_dt.timestamp())
else:
if period.lower() == "max":
if end is None:
end = int(_time.time())
if start is None:
if interval == "1m":
start = end - 691200 # 8 days
elif interval in ("2m", "5m", "15m", "30m", "90m"):
start = end - 5184000 # 60 days
elif interval in ("1h", "60m"):
start = end - 63072000 # 730 days
else:
start = end - 3122064000 # 99 years
start += 5 # allow for processing time
elif start and end:
raise ValueError("Setting period, start and end is nonsense. Set maximum 2 of them.")
elif start or end:
period_td = utils._interval_to_timedelta(period)
if end is None:
end_dt = start_dt + period_td
end = int(end_dt.timestamp())
if start is None:
start_dt = end_dt - period_td
start = int(start_dt.timestamp())
period = None
if start or end:
params = {"period1": start, "period2": end}
else:
period = period.lower()
params = {"range": period}
params["interval"] = interval.lower()
params["includePrePost"] = prepost
# 1) fix weird bug with Yahoo! - returning 60m for 30m bars
if params["interval"] == "30m":
params["interval"] = "15m"
# if the ticker is MUTUALFUND or ETF, then get capitalGains events
params["events"] = "div,splits,capitalGains"
params_pretty = dict(params)
tz = self.tz
for k in ["period1", "period2"]:
if k in params_pretty:
params_pretty[k] = str(pd.Timestamp(params[k], unit='s').tz_localize("UTC").tz_convert(tz))
logger.debug(f'{self.ticker}: Yahoo GET parameters: {str(params_pretty)}')
# Getting data from json
url = f"{_BASE_URL_}/v8/finance/chart/{self.ticker}"
data = None
get_fn = self._data.get
if end is not None:
end_dt = pd.Timestamp(end, unit='s').tz_localize("UTC")
dt_now = pd.Timestamp.now('UTC')
data_delay = _datetime.timedelta(minutes=30)
if end_dt + data_delay <= dt_now:
# Date range in past so safe to fetch through cache:
get_fn = self._data.cache_get
try:
data = get_fn(
url=url,
params=params,
timeout=timeout
)
if "Will be right back" in data.text or data is None:
raise YFDataException("*** YAHOO! FINANCE IS CURRENTLY DOWN! ***")
data = data.json()
# Special case for rate limits
except YFRateLimitError:
raise
except Exception:
if raise_errors or (not YfConfig.debug.hide_exceptions):
raise
# Store the meta data that gets retrieved simultaneously
try:
safe_chart = (data or {}).get('chart') or {}
result_list = safe_chart.get('result')
if isinstance(result_list, list) and len(result_list) > 0:
first_item = result_list[0] or {}
meta = first_item.get('meta') or {}
else:
meta = {}
except Exception:
meta = {}
self._history_metadata = meta
self._history_metadata['YF repair?'] = repair
intraday = params["interval"][-1] in ("m", 'h')
_price_data_debug = ''
if start or period is None or period.lower() == "max":
_price_data_debug += f' ({params["interval"]} '
if start_user is not None:
_price_data_debug += f'{start_user}'
elif not intraday:
_price_data_debug += f'{pd.Timestamp(start, unit="s").tz_localize("UTC").tz_convert(tz).date()}'
else:
_price_data_debug += f'{pd.Timestamp(start, unit="s").tz_localize("UTC").tz_convert(tz)}'
_price_data_debug += ' -> '
if end_user is not None:
_price_data_debug += f'{end_user})'
elif not intraday:
_price_data_debug += f'{pd.Timestamp(end, unit="s").tz_localize("UTC").tz_convert(tz).date()})'
else:
_price_data_debug += f'{pd.Timestamp(end, unit="s").tz_localize("UTC").tz_convert(tz)})'
else:
_price_data_debug += f' (period={period})'
fail = False
if data is None or not isinstance(data, dict):
_exception = YFPricesMissingError(self.ticker, _price_data_debug)
fail = True
elif isinstance(data, dict) and 'status_code' in data:
_price_data_debug += f"(Yahoo status_code = {data['status_code']})"
_exception = YFPricesMissingError(self.ticker, _price_data_debug)
fail = True
elif "chart" in data and data["chart"] and data["chart"]["error"]:
_price_data_debug += ' (Yahoo error = "' + data["chart"]["error"]["description"] + '")'
_exception = YFPricesMissingError(self.ticker, _price_data_debug)
fail = True
elif "chart" not in data or not data["chart"] or data["chart"]["result"] is None or not data["chart"]["result"] or not data["chart"]["result"][0]["indicators"]["quote"][0]:
_exception = YFPricesMissingError(self.ticker, _price_data_debug)
fail = True
elif period and period not in self._history_metadata['validRanges'] and not utils.is_valid_period_format(period):
# User provided a bad period
_exception = YFInvalidPeriodError(self.ticker, period, ", ".join(self._history_metadata['validRanges']))
fail = True
if fail:
err_msg = str(_exception)
self._last_error = err_msg.split(': ', 1)[1]
if raise_errors or (not YfConfig.debug.hide_exceptions):
raise _exception
else:
logger.error(err_msg)
if self._reconstruct_start_interval is not None and self._reconstruct_start_interval == interval:
self._reconstruct_start_interval = None
return utils.empty_df()
# Select useful info from metadata
quote_type = self._history_metadata["instrumentType"]
expect_capital_gains = quote_type in ('MUTUALFUND', 'ETF')
tz_exchange = self._history_metadata["exchangeTimezoneName"]
currency = self._history_metadata["currency"]
# Process custom periods
if period and period not in self._history_metadata.get("validRanges", []):
end = int(_time.time())
end_dt = pd.Timestamp(end, unit='s').tz_localize("UTC")
start = _datetime.date.fromtimestamp(end)
start -= utils._interval_to_timedelta(period)
start -= _datetime.timedelta(days=4)
# parse quotes
quotes = utils.parse_quotes(data["chart"]["result"][0])
# Yahoo bug fix - it often appends latest price even if after end date
if end and not quotes.empty:
if quotes.index[-1] >= end_dt.tz_convert('UTC').tz_localize(None):
quotes = quotes.drop(quotes.index[-1])
if quotes.empty:
msg = f'{self.ticker}: yfinance received OHLC data: EMPTY'
elif len(quotes) == 1:
msg = f'{self.ticker}: yfinance received OHLC data: {quotes.index[0]} only'
else:
msg = f'{self.ticker}: yfinance received OHLC data: {quotes.index[0]} -> {quotes.index[-1]}'
logger.debug(msg)
# 2) fix weird bug with Yahoo! - returning 60m for 30m bars
if interval.lower() == "30m":
logger.debug(f'{self.ticker}: resampling 30m OHLC from 15m')
quotes2 = quotes.resample('30min')
quotes = pd.DataFrame(index=quotes2.last().index, data={
'Open': quotes2['Open'].first(),
'High': quotes2['High'].max(),
'Low': quotes2['Low'].min(),
'Close': quotes2['Close'].last(),
'Adj Close': quotes2['Adj Close'].last(),
'Volume': quotes2['Volume'].sum()
})
# Note: ordering is important. If you change order, run the tests!
quotes = utils.set_df_tz(quotes, interval, tz_exchange)
quotes = utils.fix_Yahoo_dst_issue(quotes, interval)
intraday = params["interval"][-1] in ("m", 'h')
if not prepost and intraday and "tradingPeriods" in self._history_metadata:
tps = self._history_metadata["tradingPeriods"]
if not isinstance(tps, pd.DataFrame):
self._history_metadata = utils.format_history_metadata(self._history_metadata, tradingPeriodsOnly=True)
self._history_metadata_formatted = True
tps = self._history_metadata["tradingPeriods"]
quotes = utils.fix_Yahoo_returning_prepost_unrequested(quotes, interval, tps)
if quotes.empty:
msg = f'{self.ticker}: OHLC after cleaning: EMPTY'
elif len(quotes) == 1:
msg = f'{self.ticker}: OHLC after cleaning: {quotes.index[0]} only'
else:
msg = f'{self.ticker}: OHLC after cleaning: {quotes.index[0]} -> {quotes.index[-1]}'
logger.debug(msg)
# actions
dividends, splits, capital_gains = utils.parse_actions(data["chart"]["result"][0])
if not expect_capital_gains:
capital_gains = None
if splits is not None:
splits = utils.set_df_tz(splits, interval, tz_exchange)
self._splits = splits['Stock Splits'].rename_axis('Date')
else:
self._splits = pd.Series()
if dividends is not None:
dividends = utils.set_df_tz(dividends, interval, tz_exchange)
if 'currency' in dividends.columns:
# Rare, only seen with Vietnam market, or
# companies that distribute dividends in a different currency
self._dividends = dividends.rename_axis('Date')
price_currency = self._history_metadata['currency']
if price_currency is None:
price_currency = ''
f_currency_mismatch = dividends['currency'] != price_currency
if f_currency_mismatch.any():
if repair and price_currency != '':
# Attempt repair = currency conversion
dividends = self._dividends_convert_fx(dividends, price_currency, repair)
dividends = dividends.drop('currency', axis=1)
else:
self._dividends = dividends['Dividends'].rename_axis('Date')
else:
self._dividends = pd.Series()
if capital_gains is not None:
capital_gains = utils.set_df_tz(capital_gains, interval, tz_exchange)
self._capital_gains = capital_gains['Capital Gains'].rename_axis('Date')
else:
self._capital_gains = pd.Series()
if start is not None:
if not quotes.empty:
start_d = quotes.index[0].floor('D')
if dividends is not None:
dividends = dividends.loc[start_d:]
if capital_gains is not None:
capital_gains = capital_gains.loc[start_d:]
if splits is not None:
splits = splits.loc[start_d:]
if end is not None:
# -1 because date-slice end is inclusive
end_dt_sub1 = end_dt - pd.Timedelta(1)
if dividends is not None:
dividends = dividends[:end_dt_sub1]
if capital_gains is not None:
capital_gains = capital_gains[:end_dt_sub1]
if splits is not None:
splits = splits[:end_dt_sub1]
# Prepare for combine
intraday = params["interval"][-1] in ("m", 'h')
if not intraday:
# If localizing a midnight during DST transition hour when clocks roll back,
# meaning clock hits midnight twice, then use the 2nd (ambiguous=True)
quotes.index = pd.to_datetime(quotes.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward')
if dividends.shape[0] > 0:
dividends.index = pd.to_datetime(dividends.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward')
if splits.shape[0] > 0:
splits.index = pd.to_datetime(splits.index.date).tz_localize(tz_exchange, ambiguous=True, nonexistent='shift_forward')
# Combine
df = quotes.sort_index()
if dividends.shape[0] > 0:
df = utils.safe_merge_dfs(df, dividends, interval)
if "Dividends" in df.columns:
df.loc[df["Dividends"].isna(), "Dividends"] = 0
else:
df["Dividends"] = 0.0
if splits.shape[0] > 0:
df = utils.safe_merge_dfs(df, splits, interval)
if "Stock Splits" in df.columns:
df.loc[df["Stock Splits"].isna(), "Stock Splits"] = 0
else:
df["Stock Splits"] = 0.0
if expect_capital_gains:
if capital_gains.shape[0] > 0:
df = utils.safe_merge_dfs(df, capital_gains, interval)
if "Capital Gains" in df.columns:
df.loc[df["Capital Gains"].isna(), "Capital Gains"] = 0
else:
df["Capital Gains"] = 0.0
if df.empty:
msg = f'{self.ticker}: OHLC after combining events: EMPTY'
elif len(df) == 1:
msg = f'{self.ticker}: OHLC after combining events: {df.index[0]} only'
else:
msg = f'{self.ticker}: OHLC after combining events: {df.index[0]} -> {df.index[-1]}'
logger.debug(msg)
df, last_trade = utils.fix_Yahoo_returning_live_separate(df, params["interval"], tz_exchange, prepost, repair=repair, currency=currency)
if last_trade is not None:
self._history_metadata['lastTrade'] = {'Price':last_trade['Close'], "Time":last_trade.name}
df = df[~df.index.duplicated(keep='first')] # must do before repair
if repair:
# Do this before auto/back adjust
logger.debug(f'{self.ticker}: checking OHLC for repairs ...')
df = df.sort_index()
# Must fix bad 'Adj Close' & dividends before 100x/split errors.
# First make currency consistent. On some exchanges, dividends often in different currency
# to prices, e.g. £ vs pence.
df, currency = self._standardise_currency(df, currency)
self._history_metadata['currency'] = currency
f_na = df['Volume'].isna()
if f_na.any():
# Because converting to Int, need to handle NaNs
df.loc[f_na, 'Volume'] = 0
df = self._fix_bad_div_adjust(df, interval, prepost, currency)
# Need the latest/last row to be repaired before 100x/split repair:
if not df.empty:
df_last = self._fix_zeroes(df.iloc[-1:], interval, tz_exchange, prepost)
if 'Repaired?' not in df.columns:
df['Repaired?'] = False
df = pd.concat([df.drop(df.index[-1]), df_last])
df = self._fix_unit_mixups(df, interval, tz_exchange, prepost)
df = self._fix_bad_stock_splits(df, interval, tz_exchange)
# Must repair 100x and split errors before price reconstruction
df = self._fix_zeroes(df, interval, tz_exchange, prepost)
# New:
df = self._repair_capital_gains(df)
df = df.sort_index()
# Auto/back adjust
try:
if auto_adjust:
df = utils.auto_adjust(df)
elif back_adjust:
df = utils.back_adjust(df)
except Exception as e:
if raise_errors or (not YfConfig.debug.hide_exceptions):
raise
if auto_adjust:
err_msg = "auto_adjust failed with %s" % e
else:
err_msg = "back_adjust failed with %s" % e
self._last_error = err_msg
logger.error('%s: %s' % (self.ticker, err_msg))
if rounding:
df = np.round(df, data["chart"]["result"][0]["meta"]["priceHint"])
df['Volume'] = df['Volume'].fillna(0).astype(np.int64)
if intraday:
df.index.name = "Datetime"
else:
df.index.name = "Date"
# missing rows cleanup
if not actions:
df = df.drop(columns=["Dividends", "Stock Splits", "Capital Gains"], errors='ignore')
if not keepna:
data_colnames = _PRICE_COLNAMES_ + ['Volume'] + ['Dividends', 'Stock Splits', 'Capital Gains']
data_colnames = [c for c in data_colnames if c in df.columns]
mask_nan_or_zero = (df[data_colnames].isna() | (df[data_colnames] == 0)).all(axis=1)
df = df.drop(mask_nan_or_zero.index[mask_nan_or_zero])
if interval != interval_user:
df = self._resample(df, interval, interval_user, period_user)
if df.empty:
msg = f'{self.ticker}: yfinance returning OHLC: EMPTY'
elif len(df) == 1:
msg = f'{self.ticker}: yfinance returning OHLC: {df.index[0]} only'
else:
msg = f'{self.ticker}: yfinance returning OHLC: {df.index[0]} -> {df.index[-1]}'
logger.debug(msg)
# Don't care that Pandas hid this. If they do it to improve performance, we do it.
df = df._consolidate()
if self._reconstruct_start_interval is not None and self._reconstruct_start_interval == interval:
self._reconstruct_start_interval = None
return df
def _get_history_cache(self, period="max", interval="1d", repair=False) -> pd.DataFrame:
cache_key = (interval, period, repair)
if cache_key not in self._history_cache.keys():
df = self.history(period=period, interval=interval, repair=repair, prepost=True)
self._history_cache[cache_key] = {'prices': df, 'dividends': self._dividends,
'splits': self._splits,
'capital gains': self._capital_gains}
return self._history_cache[cache_key]
def get_history_metadata(self, repair=_SENTINEL_) -> dict:
"""
repair default value depends on whether user requested price repair
with previous history() call. If user did not set repair here, then
it is set to match previous history() call.
"""
# - repair affects currency, particularly GBp -> GBP
if self._history_metadata is None or 'tradingPeriods' not in self._history_metadata:
# Request intraday data, because then Yahoo returns exchange schedule (tradingPeriods).
if repair == _SENTINEL_:
if self._history_metadata is not None:
repair = self._history_metadata['YF repair?']
else:
# default
repair = False
self._get_history_cache(period="5d", interval="1h", repair=repair)['prices']
if self._history_metadata_formatted is False:
self._history_metadata = utils.format_history_metadata(self._history_metadata)
self._history_metadata_formatted = True
return self._history_metadata
def get_dividends(self, period="max", repair=False) -> pd.Series:
return self._get_history_cache(interval='1d', period=period, repair=repair)['dividends']
def get_capital_gains(self, period="max", repair=False) -> pd.Series:
return self._get_history_cache(interval='1d', period=period, repair=repair)['capital gains']
def get_splits(self, period="max", repair=False) -> pd.Series:
return self._get_history_cache(interval='1d', period=period, repair=repair)['splits']
def get_actions(self, period="max") -> pd.Series:
data = self._get_history_cache(period=period)
df = data['prices']
divs = data['dividends']
if divs is not None and isinstance(divs, pd.DataFrame) and 'currency' in divs.columns:
# Add dividends currency column
df = utils.safe_merge_dfs(df.drop('Dividends', axis=1), divs, '1d')
df['currency'] = df['currency'].fillna('')
df['Dividends'] = df['Dividends'].fillna(0.0)
df = df.rename(columns={'currency': 'Dividends FX'})
cols = ['Dividends', 'Dividends FX', 'Stock Splits', 'Capital Gains']
actions = df[[c for c in cols if c in df.columns]]
cols_numeric = ['Dividends', 'Stock Splits', 'Capital Gains']
cols_numeric = [c for c in cols_numeric if c in actions.columns]
actions = actions[(actions[cols_numeric]!=0).any(axis=1)]
for c in cols_numeric:
if (actions[c] == 0.0).all():
actions = actions.drop(c, axis=1)
return actions
def _resample(self, df, df_interval, target_interval, period=None) -> pd.DataFrame:
# resample
if df_interval == target_interval:
return df
offset = None
origin = 'epoch' # default
if target_interval == '1wk':
if period == 'ytd':
resample_period = '7D' # was 'W'
year_start = pd.Timestamp(f"{_datetime.datetime.now().year}-01-01")
origin = year_start.tz_localize(df.index.tz)
else:
resample_period = 'W-MON'
elif target_interval == '5d':
resample_period = '5D'
if period == 'ytd':
year_start = pd.Timestamp(f"{_datetime.datetime.now().year}-01-01")
origin = year_start.tz_localize(df.index.tz)
elif target_interval == '1mo':
resample_period = 'MS'
elif target_interval == '3mo':
if period == 'ytd':
align_month = 'JAN'
else:
align_month = _datetime.datetime.now().strftime('%b').upper()
resample_period = f"QS-{align_month}"
elif target_interval == '1d':
resample_period = '1D'
else:
raise ValueError(f"Not implemented resampling to interval '{target_interval}'")
resample_map = {
'Open': 'first', 'Low': 'min', 'High': 'max', 'Close': 'last',
'Volume': 'sum', 'Dividends': 'sum', 'Stock Splits': 'prod'
}
if 'Repaired?' in df.columns:
resample_map['Repaired?'] = 'any'
if 'Adj Close' in df.columns:
resample_map['Adj Close'] = resample_map['Close']
if 'Capital Gains' in df.columns:
resample_map['Capital Gains'] = 'sum'
df.loc[df['Stock Splits']==0.0, 'Stock Splits'] = 1.0
if origin != 'epoch':
df2 = df.resample(resample_period, label='left', closed='left', origin=origin).agg(resample_map)
else:
df2 = df.resample(resample_period, label='left', closed='left', offset=offset).agg(resample_map)
df2.loc[df2['Stock Splits']==1.0, 'Stock Splits'] = 0.0
# Handle NaNs from very long holidays.
prev_close = df2['Close'].shift(1).ffill()
for c in ['Open', 'High', 'Low', 'Close']:
df2[c] = df2[c].fillna(prev_close)
return df2
@utils.log_indent_decorator
def _reconstruct_intervals_batch(self, df, interval, prepost, tag=-1):
# Reconstruct values in df using finer-grained price data. Delimiter marks what to reconstruct
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'price-reconstruct', 'yf_interval': interval, 'yf_symbol': self.ticker}
if not isinstance(df, pd.DataFrame):
raise ValueError("'df' must be a Pandas DataFrame not", type(df))
if interval == "1m":
# Can't go smaller than 1m so can't reconstruct
return df
if interval[1:] in ['d', 'wk', 'mo']:
# Interday data always includes pre & post
prepost = True
intraday = False
else:
intraday = True
price_cols = [c for c in _PRICE_COLNAMES_ if c in df]
data_cols = price_cols + ["Volume"]
# If interval is weekly then can construct with daily. But if smaller intervals then
# restricted to recent times:
intervals = ["1wk", "1d", "1h", "30m", "15m", "5m", "2m", "1m"]
itds = {i: utils._interval_to_timedelta(interval) for i in intervals}
nexts = {intervals[i]: intervals[i + 1] for i in range(len(intervals) - 1)}
min_lookbacks = {"1wk": None, "1d": None, "1h": _datetime.timedelta(days=730)}
for i in ["30m", "15m", "5m", "2m"]:
min_lookbacks[i] = _datetime.timedelta(days=60)
min_lookbacks["1m"] = _datetime.timedelta(days=30)
if interval in nexts:
sub_interval = nexts[interval]
td_range = itds[interval]
else:
logger.warning(f"Have not implemented price reconstruct for '{interval}' interval. Contact developers")
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
# Limit max reconstruction depth to 2:
if self._reconstruct_start_interval is None:
self._reconstruct_start_interval = interval
if interval != self._reconstruct_start_interval and interval != nexts[self._reconstruct_start_interval]:
msg = "Hit max depth of 2 ('{}'->'{}'->'{}')".format(self._reconstruct_start_interval, nexts[self._reconstruct_start_interval], interval)
logger.info(msg, extra=log_extras)
return df
df = df.sort_index()
f_repair = df[data_cols].to_numpy() == tag
f_repair_rows = f_repair.any(axis=1)
# Ignore old intervals for which Yahoo won't return finer data:
m = min_lookbacks[sub_interval]
if m is None:
min_dt = None
else:
m -= _datetime.timedelta(days=1) # allow space for 1-day padding
min_dt = pd.Timestamp.now('UTC') - m
min_dt = min_dt.tz_convert(df.index.tz).ceil("D")
logger.debug(f"min_dt={min_dt} interval={interval} sub_interval={sub_interval}", extra=log_extras)
if min_dt is not None:
f_recent = df.index >= min_dt
f_repair_rows = f_repair_rows & f_recent
if not f_repair_rows.any():
msg = f"Too old ({np.sum(f_repair.any(axis=1))} rows tagged)"
logger.info(msg, extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
dts_to_repair = df.index[f_repair_rows]
if len(dts_to_repair) == 0:
logger.debug("Nothing needs repairing (dts_to_repair[] empty)", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
df_v2 = df.copy()
if "Repaired?" not in df_v2.columns:
df_v2["Repaired?"] = False
f_good = ~(df[price_cols].isna().any(axis=1))
f_good = f_good & (df[price_cols].to_numpy() != tag).all(axis=1)
df_good = df[f_good]
# Group nearby NaN-intervals together to reduce number of Yahoo fetches
dts_groups = [[dts_to_repair[0]]]
# Note on setting max size: have to allow space for adding good data
if sub_interval == "1mo":
grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
elif sub_interval == "1wk":
grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
elif sub_interval == "1d":
grp_max_size = _dateutil.relativedelta.relativedelta(years=2)
elif sub_interval == "1h":
grp_max_size = _dateutil.relativedelta.relativedelta(years=1)
elif sub_interval == "1m":
grp_max_size = _datetime.timedelta(days=5) # allow 2 days for buffer below
else:
grp_max_size = _datetime.timedelta(days=30)
logger.debug(f"grp_max_size = {grp_max_size}", extra=log_extras)
for i in range(1, len(dts_to_repair)):
dt = dts_to_repair[i]
if dt.date() < dts_groups[-1][0].date() + grp_max_size:
dts_groups[-1].append(dt)
else:
dts_groups.append([dt])
logger.debug("Repair groups:", extra=log_extras)
for g in dts_groups:
logger.debug(f"- {g[0]} -> {g[-1]}")
# Add some good data to each group, so can calibrate prices later:
for i in range(len(dts_groups)):
g = dts_groups[i]
g0 = g[0]
i0 = df_good.index.get_indexer([g0], method="nearest")[0]
if i0 > 0:
if (min_dt is None or df_good.index[i0 - 1] >= min_dt) and \
((not intraday) or df_good.index[i0 - 1].date() == g0.date()):
i0 -= 1
gl = g[-1]
il = df_good.index.get_indexer([gl], method="nearest")[0]
if il < len(df_good) - 1:
if (not intraday) or df_good.index[il + 1].date() == gl.date():
il += 1
good_dts = df_good.index[i0:il + 1]
dts_groups[i] += good_dts.to_list()
dts_groups[i].sort()
n_fixed = 0
for g in dts_groups:
df_block = df[df.index.isin(g)]
logger.debug("df_block:\n" + str(df_block))
start_dt = g[0]
start_d = start_dt.date()
reject = False
if sub_interval == "1h" and (_datetime.date.today() - start_d) > _datetime.timedelta(days=729):
reject = True
elif sub_interval in ["30m", "15m"] and (_datetime.date.today() - start_d) > _datetime.timedelta(days=59):
reject = True
if reject:
# Don't bother requesting more price data, Yahoo will reject
msg = f"Cannot reconstruct block starting {start_dt if intraday else start_d}, too old, Yahoo will reject request for finer-grain data"
logger.info(msg, extra=log_extras)
continue
td_1d = _datetime.timedelta(days=1)
if interval in "1wk":
fetch_start = start_d - td_range # need previous week too
fetch_end = g[-1].date() + td_range
elif interval == "1d":
fetch_start = start_d
fetch_end = g[-1].date() + td_range
else:
fetch_start = g[0]
fetch_end = g[-1] + td_range
# The first and last day returned by Yahoo can be slightly wrong, so add buffer:
fetch_start -= td_1d
fetch_end += td_1d
if intraday:
fetch_start = fetch_start.date()
fetch_end = fetch_end.date() + td_1d
if min_dt is not None:
fetch_start = max(min_dt.date(), fetch_start)
logger.debug(f"Fetching {sub_interval} prepost={prepost} {fetch_start}->{fetch_end}", extra=log_extras)
# Temp disable errors printing
logger = utils.get_yf_logger()
if hasattr(logger, 'level'):
# YF's custom indented logger doesn't expose level
log_level = logger.level
logger.setLevel(logging.CRITICAL)
df_fine = self.history(start=fetch_start, end=fetch_end, interval=sub_interval, auto_adjust=False, actions=True, prepost=prepost, repair=True, keepna=True)
if hasattr(logger, 'level'):
logger.setLevel(log_level)
if df_fine is None or df_fine.empty:
msg = f"Cannot reconstruct block starting {start_dt if intraday else start_d}, too old, Yahoo will reject request for finer-grain data"
logger.info(msg, extra=log_extras)
continue
# Discard the buffer
df_fine = df_fine.loc[g[0]: g[-1] + itds[sub_interval] - _datetime.timedelta(milliseconds=1)].copy()
if df_fine.empty:
msg = f"Cannot reconstruct {interval} block range {start_dt if intraday else start_d}, Yahoo not returning finer-grain data within range"
logger.info(msg, extra=log_extras)
continue
df_fine["ctr"] = 0
if interval == "1wk":
weekdays = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
week_end_day = weekdays[(df_block.index[0].weekday() + 7 - 1) % 7]
df_fine["Week Start"] = df_fine.index.tz_localize(None).to_period("W-" + week_end_day).start_time
grp_col = "Week Start"
elif interval == "1d":
df_fine["Day Start"] = pd.to_datetime(df_fine.index.date)
grp_col = "Day Start"
else:
df_fine.loc[df_fine.index.isin(df_block.index), "ctr"] = 1
df_fine["intervalID"] = df_fine["ctr"].cumsum()
df_fine = df_fine.drop("ctr", axis=1)
grp_col = "intervalID"
df_fine = df_fine[~df_fine[price_cols + ['Dividends']].isna().all(axis=1)]
df_fine_grp = df_fine.groupby(grp_col)
df_new = df_fine_grp.agg(
Open=("Open", "first"),
Close=("Close", "last"),
AdjClose=("Adj Close", "last"),
Low=("Low", "min"),
High=("High", "max"),
Dividends=("Dividends", "sum"),
Volume=("Volume", "sum")).rename(columns={"AdjClose": "Adj Close"})
if grp_col in ["Week Start", "Day Start"]:
df_new.index = df_new.index.tz_localize(df_fine.index.tz)
else:
df_fine["diff"] = df_fine["intervalID"].diff()
new_index = np.append([df_fine.index[0]], df_fine.index[df_fine["intervalID"].diff() > 0])
df_new.index = new_index
logger.debug('df_new:' + '\n' + str(df_new))
# Calibrate!
common_index = np.intersect1d(df_block.index, df_new.index)
if len(common_index) == 0:
# Can't calibrate so don't attempt repair
msg = f"Can't calibrate {interval} block starting {start_d} so aborting repair"
logger.info(msg, extra=log_extras)
continue
# First, attempt to calibrate the 'Adj Close' column. OK if cannot.
# Only necessary for 1d interval, because the 1h data is not div-adjusted.
if interval == '1d':
df_new_calib = df_new[df_new.index.isin(common_index)]
df_block_calib = df_block[df_block.index.isin(common_index)]
f_tag = df_block_calib['Adj Close'] == tag
if f_tag.any():
div_adjusts = df_block_calib['Adj Close'] / df_block_calib['Close']
# The loop below assumes each 1d repair is isolated, i.e. surrounded by
# good data. Which is case most of time.
# But in case are repairing a chunk of bad 1d data, back/forward-fill the
# good div-adjustments - not perfect, but a good backup.
div_adjusts[f_tag] = np.nan
if not div_adjusts.isna().all():
# Need some real values to calibrate
div_adjusts = div_adjusts.ffill().bfill()
for idx in np.where(f_tag)[0]:
dt = df_new_calib.index[idx]
n = len(div_adjusts)
if df_new.loc[dt, "Dividends"] != 0:
if idx < n - 1:
# Easy, take div-adjustment from next-day
div_adjusts.iloc[idx] = div_adjusts.iloc[idx + 1]
else:
# Take previous-day div-adjustment and reverse todays adjustment
div_adj = 1.0 - df_new_calib["Dividends"].iloc[idx] / df_new_calib['Close'].iloc[
idx - 1]
div_adjusts.iloc[idx] = div_adjusts.iloc[idx - 1] / div_adj
else:
if idx > 0:
# Easy, take div-adjustment from previous-day
div_adjusts.iloc[idx] = div_adjusts.iloc[idx - 1]
else:
# Must take next-day div-adjustment
div_adjusts.iloc[idx] = div_adjusts.iloc[idx + 1]
if df_new_calib["Dividends"].iloc[idx + 1] != 0:
div_adjusts.iloc[idx] *= 1.0 - df_new_calib["Dividends"].iloc[idx + 1] / \
df_new_calib['Close'].iloc[idx]
f_close_bad = df_block_calib['Close'] == tag
div_adjusts = div_adjusts.reindex(df_block.index, fill_value=np.nan).ffill().bfill()
df_new['Adj Close'] = df_block['Close'] * div_adjusts
if f_close_bad.any():
f_close_bad_new = f_close_bad.reindex(df_new.index, fill_value=False)
div_adjusts_new = div_adjusts.reindex(df_new.index, fill_value=np.nan).ffill().bfill()
div_adjusts_new_np = f_close_bad_new.to_numpy()
df_new.loc[div_adjusts_new_np, 'Adj Close'] = df_new['Close'][div_adjusts_new_np] * div_adjusts_new[div_adjusts_new_np]
# Check whether 'df_fine' has different split-adjustment.
# If different, then adjust to match 'df'
calib_cols = ['Open', 'Close']
df_new_calib = df_new[df_new.index.isin(common_index)][calib_cols].to_numpy()
df_block_calib = df_block[df_block.index.isin(common_index)][calib_cols].to_numpy()
calib_filter = (df_block_calib != tag)
calib_filter = calib_filter & (~np.isnan(df_new_calib))
if not calib_filter.any():
# Can't calibrate so don't attempt repair
logger.info(f"Can't calibrate block starting {start_d} so aborting repair", extra=log_extras)
continue
# Avoid divide-by-zero warnings:
for j in range(len(calib_cols)):
f = ~calib_filter[:, j]
if f.any():
if not df_block_calib.flags.writeable:
df_block_calib = df_block_calib.copy()
if not df_new_calib.flags.writeable:
df_new_calib = df_new_calib.copy()
df_block_calib[f, j] = 1
df_new_calib[f, j] = 1
ratios = df_block_calib[calib_filter] / df_new_calib[calib_filter]
weights = df_fine_grp.size()
weights.index = df_new.index
weights = weights[weights.index.isin(common_index)].to_numpy().astype(float)
if not weights.flags.writeable:
weights = weights.copy()
weights = weights[:, None] # transpose
weights = np.tile(weights, len(calib_cols)) # 1D -> 2D
weights = weights[calib_filter] # flatten
not1 = ~np.isclose(ratios, 1.0, rtol=0.00001)
if np.sum(not1) == len(calib_cols):
# Only 1 calibration row in df_new is different to df_block so ignore
ratio = 1.0
else:
ratio = np.average(ratios, weights=weights)
if abs(ratio/0.0001 -1) < 0.01:
# ratio almost-equal 0.0001, so looks like Yahoo messed up currency unit.
# E.g. £ with pence. Can correct it.
df_block = df_block.copy()
for c in _PRICE_COLNAMES_:
df_v2.loc[df_v2[c]!=tag, c] *= 100
ratio *= 100
logger.debug(f"Price calibration ratio (raw) = {ratio:6f}", extra=log_extras)
ratio_rcp = round(1.0 / ratio, 1)
ratio = round(ratio, 1)
if ratio == 1 and ratio_rcp == 1:
# Good!
pass
else:
if ratio > 1:
# data has different split-adjustment than fine-grained data
# Adjust fine-grained to match
df_new[price_cols] *= ratio
df_new["Volume"] /= ratio
elif ratio_rcp > 1:
# data has different split-adjustment than fine-grained data
# Adjust fine-grained to match
df_new[price_cols] *= 1.0 / ratio_rcp
df_new["Volume"] *= ratio_rcp
# Repair!
bad_dts = df_block.index[(df_block[price_cols + ["Volume"]] == tag).to_numpy().any(axis=1)]
no_fine_data_dts = []
for idx in bad_dts:
if idx not in df_new.index:
# Yahoo didn't return finer-grain data for this interval,
# so probably no trading happened.
no_fine_data_dts.append(idx)
if len(no_fine_data_dts) > 0:
logger.debug("Yahoo didn't return finer-grain data for these intervals: " + str(no_fine_data_dts), extra=log_extras)
for idx in bad_dts:
if idx not in df_new.index:
# Yahoo didn't return finer-grain data for this interval,
# so probably no trading happened.
continue
df_new_row = df_new.loc[idx]
if interval == "1wk":
df_last_week = df_new.iloc[df_new.index.get_loc(idx) - 1]
df_fine = df_fine.loc[idx:]
df_bad_row = df.loc[idx]
bad_fields = df_bad_row.index[df_bad_row == tag].to_numpy()
if "High" in bad_fields:
df_v2.loc[idx, "High"] = df_new_row["High"]
if "Low" in bad_fields:
df_v2.loc[idx, "Low"] = df_new_row["Low"]
if "Open" in bad_fields:
if interval == "1wk" and idx != df_fine.index[0]:
# Exchange closed Monday. In this case, Yahoo sets Open to last week close
df_v2.loc[idx, "Open"] = df_last_week["Close"]
df_v2.loc[idx, "Low"] = min(df_v2.loc[idx, "Open"], df_v2.loc[idx, "Low"])
else:
df_v2.loc[idx, "Open"] = df_new_row["Open"]
if "Close" in bad_fields:
df_v2.loc[idx, "Close"] = df_new_row["Close"]
# Assume 'Adj Close' also corrupted, easier than detecting whether true
df_v2.loc[idx, "Adj Close"] = df_new_row["Adj Close"]
elif "Adj Close" in bad_fields:
df_v2.loc[idx, "Adj Close"] = df_new_row["Adj Close"]
if "Volume" in bad_fields:
df_v2.loc[idx, "Volume"] = df_new_row["Volume"].round().astype('int')
df_v2.loc[idx, "Repaired?"] = True
n_fixed += 1
# Not logging these reconstructions - that's job of calling function as it has context.
return df_v2
def _standardise_currency(self, df, currency):
if currency not in ["GBp", "ZAc", "ILA"]:
return df, currency
currency2 = currency
if currency == 'GBp':
# UK £/pence
currency2 = 'GBP'
m = 0.01
elif currency == 'ZAc':
# South Africa Rand/cents
currency2 = 'ZAR'
m = 0.01
elif currency == 'ILA':
# Israel Shekels/Agora
currency2 = 'ILS'
m = 0.01
# Use latest row with actual volume, because volume=0 rows can be 0.01x the other rows.
# _fix_unit_switch() will ensure all rows are on same scale.
f_volume = df['Volume']>0
if not f_volume.any():
return df, currency
last_row = df.iloc[np.where(f_volume)[0][-1]]
prices_in_subunits = True # usually is true
if last_row.name > (pd.Timestamp.now('UTC') - _datetime.timedelta(days=30)):
try:
ratio = self._history_metadata['regularMarketPrice'] / last_row['Close']
if abs((ratio*m)-1) < 0.1:
# within 10% of 100x
prices_in_subunits = False
except Exception:
# Should never happen but just-in-case
if not YfConfig.debug.hide_exceptions:
raise
pass
if prices_in_subunits:
for c in _PRICE_COLNAMES_:
df[c] *= m
self._history_metadata["currency"] = currency2
self._history_metadata["currencyRepaired"] = True
f_div = df['Dividends']!=0.0
if f_div.any():
# But sometimes the dividend was in pence.
# Heuristic is: if dividend yield is ridiculous high vs converted prices, then
# assume dividend was also in pence and convert to GBP.
# Threshold for "ridiculous" based on largest yield I've seen anywhere - 63.4%
# If this simple heuristic generates a false positive, then _fix_bad_div_adjust()
# will detect and repair.
divs = df[['Close','Dividends']].copy()
divs['Close'] = divs['Close'].ffill().shift(1, fill_value=divs['Close'].iloc[0])
divs = divs[f_div]
div_pcts = (divs['Dividends'] / divs['Close']).to_numpy()
if len(div_pcts) > 0 and np.average(div_pcts) > 1:
df['Dividends'] *= m
return df, currency2
def _dividends_convert_fx(self, dividends, fx, repair=False):
bad_div_currencies = [c for c in dividends['currency'].unique() if c != fx]
major_currencies = ['USD', 'JPY', 'EUR', 'CNY', 'GBP', 'CAD']
for c in bad_div_currencies:
fx2_tkr = None
if c == 'USD':
# Simple convert from USD to target FX
fx_tkr = f'{fx}=X'
reverse = False
elif fx == 'USD':
# Use same USD FX but reversed
fx_tkr = f'{fx}=X'
reverse = True
elif c in major_currencies and fx in major_currencies:
# Simple convert
fx_tkr = f'{c}{fx}=X'
reverse = False
else:
# No guarantee that Yahoo has direct FX conversion, so
# convert via USD
# - step 1: -> USD
fx_tkr = f'{c}=X'
reverse = True
# - step 2: USD -> FX
fx2_tkr = f'{fx}=X'
fx_dat = PriceHistory(self._data, fx_tkr, self.session)
fx_rate = fx_dat.history(period='1mo', repair=repair)['Close'].iloc[-1]
if reverse:
fx_rate = 1/fx_rate
dividends.loc[dividends['currency']==c, 'Dividends'] *= fx_rate
if fx2_tkr is not None:
fx2_dat = PriceHistory(self._data, fx2_tkr, self.session)
fx2_rate = fx2_dat.history(period='1mo', repair=repair)['Close'].iloc[-1]
dividends.loc[dividends['currency']==c, 'Dividends'] *= fx2_rate
dividends['currency'] = fx
return dividends
@utils.log_indent_decorator
def _fix_unit_mixups(self, df, interval, tz_exchange, prepost):
if df.empty:
return df
df2 = self._fix_unit_switch(df, interval, tz_exchange)
df3 = self._fix_unit_random_mixups(df2, interval, tz_exchange, prepost)
return df3
@utils.log_indent_decorator
def _fix_unit_random_mixups(self, df, interval, tz_exchange, prepost):
# Sometimes Yahoo returns few prices in cents/pence instead of $/£
# I.e. 100x bigger
# 2 ways this manifests:
# - random 100x errors spread throughout table
# - a sudden switch between $<->cents at some date
# This function fixes the first.
if df.empty:
return df
# Easy to detect and fix, just look for outliers = ~100x local median
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'price-repair-100x', 'yf_interval': interval, 'yf_symbol': self.ticker}
if df.shape[0] == 0:
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
if df.shape[0] == 1:
# Need multiple rows to confidently identify outliers
logger.debug("Cannot check single-row table for 100x price errors", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
df2 = df.copy()
if df2.index.tz is None:
df2.index = df2.index.tz_localize(tz_exchange)
elif df2.index.tz != tz_exchange:
df2.index = df2.index.tz_convert(tz_exchange)
# Only import scipy if users actually want function. To avoid
# adding it to dependencies.
from scipy import ndimage as _ndimage
data_cols = ["High", "Open", "Low", "Close", "Adj Close"] # Order important, separate High from Low
data_cols = [c for c in data_cols if c in df2.columns]
f_zeroes = (df2[data_cols] == 0).any(axis=1).to_numpy()
if f_zeroes.any():
df2_zeroes = df2[f_zeroes]
df2 = df2[~f_zeroes]
df_orig = df[~f_zeroes] # all row slicing must be applied to both df and df2
else:
df2_zeroes = None
df_orig = df
if df2.shape[0] <= 1:
logger.info("Insufficient good data for detecting 100x price errors", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
df2_data = df2[data_cols].to_numpy()
median = _ndimage.median_filter(df2_data, size=(3, 3), mode="wrap")
ratio = df2_data / median
ratio_rounded = (ratio / 20).round() * 20 # round ratio to nearest 20
f = ratio_rounded == 100
ratio_rcp = 1.0/ratio
ratio_rcp_rounded = (ratio_rcp / 20).round() * 20 # round ratio to nearest 20
f_rcp = (ratio_rounded == 100) | (ratio_rcp_rounded == 100)
f_either = f | f_rcp
if not f_either.any():
logger.debug("No sporadic 100x errors", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
# Mark values to send for repair
tag = -1.0
for i in range(len(data_cols)):
fi = f_either[:, i]
c = data_cols[i]
df2.loc[fi, c] = tag
n_before = (df2_data == tag).sum()
df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag)
df2_tagged = df2[data_cols].to_numpy() == tag
n_after = (df2[data_cols].to_numpy() == tag).sum()
if n_after > 0:
# This second pass will *crudely* "fix" any remaining errors in High/Low
# simply by ensuring they don't contradict e.g. Low = 100x High.
f = (df2[data_cols].to_numpy() == tag) & f
for i in range(f.shape[0]):
fi = f[i, :]
if not fi.any():
continue
idx = df2.index[i]
for c in ['Open', 'Close']:
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 0.01
c = "High"
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()
c = "Low"
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()
f_rcp = (df2[data_cols].to_numpy() == tag) & f_rcp
for i in range(f_rcp.shape[0]):
fi = f_rcp[i, :]
if not fi.any():
continue
idx = df2.index[i]
for c in ['Open', 'Close']:
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df.loc[idx, c] * 100.0
c = "High"
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].max()
c = "Low"
j = data_cols.index(c)
if fi[j]:
df2.loc[idx, c] = df2.loc[idx, ["Open", "Close"]].min()
df2_tagged = df2[data_cols].to_numpy() == tag
n_after_crude = df2_tagged.sum()
else:
n_after_crude = n_after
n_fixed = n_before - n_after_crude
n_fixed_crudely = n_after - n_after_crude
if n_fixed > 0:
report_msg = f"fixed {n_fixed}/{n_before} currency unit mixups "
if n_fixed_crudely > 0:
report_msg += f"({n_fixed_crudely} crudely)"
logger.info(report_msg, extra=log_extras)
# Restore original values where repair failed
f_either = df2[data_cols].to_numpy() == tag
for j in range(len(data_cols)):
fj = f_either[:, j]
if fj.any():
c = data_cols[j]
df2.loc[fj, c] = df_orig.loc[fj, c]
if df2_zeroes is not None:
if "Repaired?" not in df2_zeroes.columns:
df2_zeroes["Repaired?"] = False
df2 = pd.concat([df2, df2_zeroes]).sort_index()
df2.index = pd.to_datetime(df2.index)
return df2
@utils.log_indent_decorator
def _fix_unit_switch(self, df, interval, tz_exchange):
# Sometimes Yahoo returns few prices in cents/pence instead of $/£
# I.e. 100x bigger
# 2 ways this manifests:
# - random 100x errors spread throughout table
# - a sudden switch between $<->cents at some date
# This function fixes the second.
# Eventually Yahoo fixes but could take them 2 weeks.
if self._history_metadata['currency'] == 'KWF':
# Kuwaiti Dinar divided into 1000 not 100
n = 1000
else:
n = 100
return self._fix_prices_sudden_change(df, interval, tz_exchange, n, unit_switch=True, correct_dividend=True)
@utils.log_indent_decorator
def _fix_zeroes(self, df, interval, tz_exchange, prepost):
# Sometimes Yahoo returns prices=0 or NaN when trades occurred.
# But most times when prices=0 or NaN returned is because no trades.
# Impossible to distinguish, so only attempt repair if few or rare.
if df.empty:
return df
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'price-repair-zeroes', 'yf_interval': interval, 'yf_symbol': self.ticker}
intraday = interval[-1] in ("m", 'h')
df = df.sort_index() # important!
df2 = df.copy()
if df2.index.tz is None:
df2.index = df2.index.tz_localize(tz_exchange)
elif df2.index.tz != tz_exchange:
df2.index = df2.index.tz_convert(tz_exchange)
price_cols = [c for c in _PRICE_COLNAMES_ if c in df2.columns]
f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna()
df2_reserve = None
if intraday:
# Ignore days with >50% intervals containing NaNs
grp = pd.Series(f_prices_bad.any(axis=1), name="nan").groupby(f_prices_bad.index.date)
nan_pct = grp.sum() / grp.count()
dts = nan_pct.index[nan_pct > 0.5]
f_zero_or_nan_ignore = np.isin(f_prices_bad.index.date, dts)
df2_reserve = df2[f_zero_or_nan_ignore]
df2 = df2[~f_zero_or_nan_ignore]
if df2.empty:
# No good data
if 'Repaired?' not in df.columns:
df['Repaired?'] = False
return df
df2 = df2.copy()
f_prices_bad = (df2[price_cols] == 0.0) | df2[price_cols].isna()
f_change = df2["High"].to_numpy() != df2["Low"].to_numpy()
if self.ticker.endswith("=X"):
# FX, volume always 0
f_vol_bad = None
else:
f_high_low_good = (~df2["High"].isna().to_numpy()) & (~df2["Low"].isna().to_numpy())
f_vol_zero = (df2["Volume"] == 0).to_numpy()
f_vol_bad = f_vol_zero & f_high_low_good & f_change
# ^ intra-interval price changed without volume, bad
if not intraday:
# Interday data: if close changes between intervals with volume=0 then volume is wrong.
# Possible can repair with intraday, but usually Yahoo does not have the volume.
close_diff = df2['Close'].diff()
close_diff.iloc[0] = 0
close_chg_pct_abs = np.abs(close_diff / df2['Close'])
f_bad_price_chg = (close_chg_pct_abs > 0.05).to_numpy() & f_vol_zero
f_vol_bad = f_vol_bad | f_bad_price_chg
# If stock split occurred, then trading must have happened.
# I should probably rename the function, because prices aren't zero ...
if 'Stock Splits' in df2.columns:
f_split = (df2['Stock Splits'] != 0.0).to_numpy()
if f_split.any():
f_change_expected_but_missing = f_split & ~f_change
if f_change_expected_but_missing.any():
f_prices_bad[f_change_expected_but_missing] = True
# Check whether worth attempting repair
f_prices_bad = f_prices_bad.to_numpy()
f_bad_rows = f_prices_bad.any(axis=1)
if f_vol_bad is not None:
f_bad_rows = f_bad_rows | f_vol_bad
if not f_bad_rows.any():
logger.debug("No price=0 errors to repair", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
if f_prices_bad.sum() == len(price_cols) * len(df2):
# Need some good data to calibrate
logger.debug("No good data for calibration so cannot fix price=0 bad data", extra=log_extras)
if "Repaired?" not in df.columns:
df["Repaired?"] = False
return df
data_cols = price_cols + ["Volume"]
# Mark values to send for repair
tag = -1.0
for i in range(len(price_cols)):
c = price_cols[i]
df2.loc[f_prices_bad[:, i], c] = tag
if f_vol_bad is not None:
df2.loc[f_vol_bad, "Volume"] = tag
# If volume=0 or NaN for bad prices, then tag volume for repair
f_vol_zero_or_nan = (df2["Volume"].to_numpy() == 0) | (df2["Volume"].isna().to_numpy())
df2.loc[f_prices_bad.any(axis=1) & f_vol_zero_or_nan, "Volume"] = tag
# If volume=0 or NaN but price moved in interval, then tag volume for repair
df2.loc[f_change & f_vol_zero_or_nan, "Volume"] = tag
df2_tagged = df2[data_cols].to_numpy() == tag
n_before = df2_tagged.sum()
dts_tagged = df2.index[df2_tagged.any(axis=1)]
df2 = self._reconstruct_intervals_batch(df2, interval, prepost, tag)
df2_tagged = df2[data_cols].to_numpy() == tag
n_after = df2_tagged.sum()
dts_not_repaired = df2.index[df2_tagged.any(axis=1)]
n_fixed = n_before - n_after
if n_fixed > 0:
msg = f"{self.ticker}: fixed {n_fixed}/{n_before} value=0 errors in {interval} price data"
if n_fixed < 4:
dts_repaired = sorted(list(set(dts_tagged).difference(dts_not_repaired)))
msg += f": {dts_repaired}"
logger.debug(msg, extra=log_extras)
if df2_reserve is not None:
if "Repaired?" not in df2_reserve.columns:
df2_reserve["Repaired?"] = False
df2 = pd.concat([df2, df2_reserve]).sort_index()
# Restore original values where repair failed (i.e. remove tag values)
f = df2[data_cols].to_numpy() == tag
for j in range(len(data_cols)):
fj = f[:, j]
if fj.any():
c = data_cols[j]
df2.loc[fj, c] = df.loc[fj, c]
return df2
@utils.log_indent_decorator
def _repair_capital_gains(self, df):
# Yahoo has started double-counting capital gains in Adj Close,
# by pre-adding it to dividends column.
if 'Capital Gains' not in df.columns:
return df
if (df['Capital Gains'] == 0).all():
return df
debug = False
# debug = True
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'repair-capital-gains', 'yf_symbol': self.ticker}
df = df.copy()
df = df.sort_index()
# Consider price drop to decide if Yahoo double-counted -
# drop should = true dividend + capital gains
# But need to account for normal price volatility:
df['Price_Change%'] = df['Close'].pct_change(fill_method=None).abs()
no_distributions = (df['Dividends'] == 0) & (df['Capital Gains'] == 0)
price_drop_pct_mean = df.loc[no_distributions, 'Price_Change%'].mean()
df = df.drop('Price_Change%', axis=1)
# Add columns if not present
if 'Repaired?' not in df.columns:
df['Repaired?'] = False
df['Adj'] = df['Adj Close'] / df['Close']
if debug:
df['ScaleFactor'] = np.nan
df['correction'] = np.nan
df['AdjYahoo'] = (df['Adj Close']/df['Close']).round(4)
print(f"# price_drop_pct_mean = {price_drop_pct_mean:.4f}")
dts = df[df['Capital Gains'] > 0].index
c = df['Close'].to_numpy()
ac = df['Adj Close'].to_numpy()
dcs = {}
for dt in dts:
idx = df.index.get_loc(dt)
if idx > 0:
# Need a row before for price drop
dividend = df['Dividends'].iloc[idx]
capital_gains = df['Capital Gains'].iloc[idx]
if dividend < capital_gains:
# Not possible for 'dividend' to be including capital gains
continue
div_pct = dividend / c[idx-1]
cg_pct = capital_gains / c[idx-1]
# Check whether adjusted price drop is closer to dividend vs dividend+capital_gains
price_drop_pct = (c[idx-1] - c[idx]) / c[idx-1]
price_drop_pct_excl_vol = price_drop_pct - price_drop_pct_mean
diff_div = abs(price_drop_pct_excl_vol - div_pct)
diff_total = abs(price_drop_pct_excl_vol - (div_pct + cg_pct))
cg_is_double_counted = diff_div < diff_total
dcs[idx] = cg_is_double_counted
if debug:
print(f"# {dt.date()}: div = {div_pct*100:.1f}%, cg = {cg_pct*100:.1f}%")
print(f"- price_drop_pct = {price_drop_pct*100:.1f}%")
print(f"- price_drop_pct_excl_vol = {price_drop_pct_excl_vol*100:.1f}%")
print(f"- diff_div = {diff_div:.4f}")
print(f"- diff_total = {diff_total:.4f}")
print(f"- cg_is_double_counted = {cg_is_double_counted}")
pct_double_counted = sum(dcs.values()) / len(dcs)
if debug:
print(f"- pct_double_counted = {pct_double_counted*100:.1f}%")
if pct_double_counted >= 0.666:
for idx in dcs.keys():
dt = df.index[idx]
dividend = df['Dividends'].iloc[idx]
capital_gains = df['Capital Gains'].iloc[idx]
# Instead of calculating new adjustment from scratch,
# reverse the double-count from existing adjustment.
# In case don't have all events after last date.
dividend_true = dividend - capital_gains
df.loc[dt, 'Dividends'] = dividend_true
# Correct adjustment for dates before and including this distribution date
adj_before = (ac[idx-1]/c[idx-1]) / (ac[idx]/c[idx])
adj_correct = 1.0 - (dividend_true + capital_gains) / c[idx-1]
correction = adj_correct / adj_before
df.loc[:dt-_datetime.timedelta(1), 'Adj'] *= correction
df.loc[:dt, 'Repaired?'] = True
msg = f"Repaired capital-gains double-count at {dt.date()}. Adj correction = {correction:.4f}"
logger.info(msg, extra=log_extras)
if debug:
df.loc[dt, 'correction'] = correction
df['Adj Close'] = df['Close'] * df['Adj']
if debug:
df['Adj'] = df['Adj'].round(4)
else:
df = df.drop('Adj', axis=1)
return df
@utils.log_indent_decorator
def _fix_bad_div_adjust(self, df, interval, prepost, currency):
# Look for dividend issues:
# - dividend ~100x the Close change (a currency unit mixup)
# - dividend missing from Adj Close
# - dividend is in Adj Close but adjustment is too much, or too small
# Experimental: also detect dividend in wrong currency e.g. $ not Israel Shekels.
# But only for big FX rates, otherwise false positives from price volatility.
if df is None or df.empty:
return df
if interval in ['1wk', '1mo', '3mo', '1y']:
return df
intraday = interval[-1] in ['h', 'm']
if 'Capital Gains' in df.columns and (df['Capital Gains']>0).any():
# So there are capital gains. This function only considers dividends.
# I don't want to deal with capital gains being wrong as well!
# But if you find capital gains that need repair e.g. 100x error, then report to our Github.
return df
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'div-adjust-repair-bad', 'yf_interval': interval, 'yf_symbol': self.ticker}
f_div = (df["Dividends"] != 0.0).to_numpy()
if not f_div.any():
logger.debug('No dividends to check', extra=log_extras)
return df
if self._history_metadata['currency'] == 'KWF':
# Kuwaiti Dinar divided into 1000 not 100
currency_divide = 1000
else:
currency_divide = 100
div_status_df = None
too_big_check_threshold = 0.035
df = df.sort_index()
df2 = df.copy()
if 'Repaired?' not in df2.columns:
df2['Repaired?'] = False
df_modified = False
# Split df2 into: nan data, and non-nan data
f_nan = df2['Close'].isna().to_numpy()
df2_nan = df2[f_nan].copy()
df2 = df2[~f_nan].copy()
f_div = (df2["Dividends"] != 0.0).to_numpy()
if not f_div.any():
logger.debug('No dividends to check', extra=log_extras)
return df
div_indices = np.where(f_div)[0]
f_inf = df2['Adj Close'].isin([np.inf, -np.inf])
if f_inf.any():
# In extreme rare case (SSNLF), Yahoo adj close
# is so bad that it overflows floating-point type into Infinity.
# So reduce those massive values.
f_ninf = ~f_inf
adjClose = df2['Adj Close'].to_numpy()
close = df2['Close'].to_numpy()
close10x = close*10
f_huge = f_ninf & (adjClose > close10x)
while f_huge.any():
adjClose[f_huge] = adjClose[f_huge]*0.001
f_huge = f_ninf & (adjClose > close10x)
df2['Adj Close'] = adjClose
df2['Adj Close'] = df2['Adj Close'].replace([np.inf, -np.inf], np.nan)
df2['Adj'] = df2['Adj Close'] / df2['Close']
df2['Adj'] = df2['Adj'].bfill()
f_adjClose_na = df2['Adj Close'].isna() & (~df2['Close'].isna())
df2.loc[f_adjClose_na, 'Adj Close'] = df2['Adj'][f_adjClose_na] * df2['Close'][f_adjClose_na]
df2 = df2.drop('Adj', axis=1)
# Very rarely, the Close (not Adj Close) is already adjusted!
# Clue is it's often lower than Low.
# E.g. ticker MPCC.OL - Oslo exchange data contradicts Yahoo.
# But sometimes the original data is bad, e.g. LSE sometimes close < low
# Can attempt to fix:
fixed_dates = []
for i in range(len(div_indices)-1, -1, -1):
div_idx = div_indices[i]
if div_idx == 0:
continue
prices_before = df2.iloc[div_idx-1]
diff = prices_before['Low'] - prices_before['Close']
div = df2['Dividends'].iloc[div_idx]
if diff > 0 and (diff/div-1)<0.01:
# Close<Low and the difference not bigger than the dividend,
# because if diff > dividend then something else caused problem.
dt_before = df2.index[div_idx-1]
new_close = prices_before['Close'] + div
if new_close >= prices_before['Low'] and new_close <= prices_before['High']:
df2.loc[dt_before, 'Close'] = new_close
adj_after = df2['Adj Close'].iloc[div_idx] / df2['Close'].iloc[div_idx]
adj = adj_after * (1.0 - div/df2['Close'].iloc[div_idx-1])
df2.loc[dt_before, 'Adj Close'] = df2['Close'].iloc[div_idx-1] * adj
df2.loc[dt_before, 'Repaired?'] = True
df_modified = True
fixed_dates.append(df2.index[div_idx].date())
if len(fixed_dates) > 0:
msg = f"Repaired double-adjustment on div days {[str(d) for d in fixed_dates]}"
logger.info(msg, extra=log_extras)
# Check dividends if too big/small for the price action
for i in range(len(div_indices)-1, -1, -1):
div_idx = div_indices[i]
dt = df2.index[div_idx]
div = df2['Dividends'].iloc[div_idx]
if div_idx == 0:
continue
div_pct = div / df2['Close'].iloc[div_idx-1]
# Check if dividend is 100x market movement.
div_too_small_improvement_threshold = 1
# div_too_big_improvement_threshold = 1
div_too_big_improvement_threshold = 2
if intraday:
# Useful to also have day move (Close -> Close)
df2_day = df2.loc[str(dt.date())].copy()
df2_day = self._resample(df2_day, interval, '1d')
if isclose(df2['Low'].iloc[div_idx], df2['Close'].iloc[div_idx-1]*100, rel_tol = 0.025):
# Price has jumped ~100x on ex-div day, need to fix immediately.
drop = df2['Close'].iloc[div_idx-1]*100 - df2['Low'].iloc[div_idx]
div_pct = div / (df2['Close'].iloc[div_idx-1]*100)
true_adjust = 1.0 - div / (df2['Close'].iloc[div_idx-1]*100)
present_adj = df2['Adj Close'].iloc[div_idx-1] / df2['Close'].iloc[div_idx-1]
if not isclose(present_adj, true_adjust, rel_tol = 0.025):
df2.loc[:dt-_datetime.timedelta(seconds=1), 'Adj Close'] = true_adjust * df2['Close'].loc[:dt-_datetime.timedelta(seconds=1)]
df2.loc[:dt-_datetime.timedelta(seconds=1), 'Repaired?'] = True
if intraday:
day_move = df2['Close'].iloc[div_idx-1]*100 - df2_day['Close'].iloc[0]
elif isclose(df2['Low'].iloc[div_idx], df2['Close'].iloc[div_idx-1]*0.01, rel_tol = 0.025):
# Price has dropped ~100x on ex-div day, need to fix immediately.
drop = df2['Close'].iloc[div_idx-1]*0.01 - df2['Low'].iloc[div_idx]
div_pct = div / (df2['Close'].iloc[div_idx-1]*0.01)
true_adjust = 1.0 - div / (df2['Close'].iloc[div_idx-1]*100)
present_adj = df2['Adj Close'].iloc[div_idx-1] / df2['Close'].iloc[div_idx-1]
if not isclose(present_adj, true_adjust, rel_tol = 0.025):
df2.loc[:dt-_datetime.timedelta(seconds=1), 'Adj Close'] = true_adjust * df2['Close'].loc[:dt-_datetime.timedelta(seconds=1)]
df2.loc[:dt-_datetime.timedelta(seconds=1), 'Repaired?'] = True
if intraday:
day_move = df2['Close'].iloc[div_idx-1]*0.01 - df2_day['Close'].iloc[0]
else:
drop = df2['Close'].iloc[div_idx-1] - df2['Low'].iloc[div_idx]
if intraday:
day_move = df2['Close'].iloc[div_idx-1] - df2_day['Close'].iloc[0]
if div_idx < len(df2)-1:
# # In low-volume scenarios, the price drop is day after not today.
# if df2['Close'].iloc[div_idx-1] == df2['Close'].iloc[div_idx] or \
# df2['Low'].iloc[div_idx] == df2['High'].iloc[div_idx]:
# drop = np.max(df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy())
# elif df2['Volume'].iloc[div_idx]==0:
# if drop == 0.0:
# drop = np.max(df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy())
#
# Hmm, can I always look ahead 1 day? Catch: increases FP rate of div-too-small for tiny divs.
# drops = df2['Close'].iloc[div_idx-1:div_idx+1].to_numpy() - df2['Low'].iloc[div_idx:div_idx+2].to_numpy()
drops = np.array([drop, df2['Close'].iloc[div_idx] - df2['Low'].iloc[div_idx+1]])
drop_2Dmax = np.max(drops)
else:
drops = np.array([drop])
drop_2Dmax = drop
if (len(df2)-div_idx) < 4:
end = min(len(df2), div_idx+4)
start = max(0, end-8)
else:
start = max(0, div_idx-4)
end = min(len(df2), start+8)
if end-start < 4:
# Not enough data to estimate volatility
typical_volatility = np.nan
else:
diffs = df2['Close'].iloc[start:end-1].to_numpy() - df2['Low'].iloc[start+1:end].to_numpy()
typical_volatility = np.mean(np.abs(diffs))
possibilities = []
if (drops==0.0).all() and df2['Volume'].iloc[div_idx]==0:
# Can't analyse price action so use crude heuristics
pct_zero_vol = np.sum(df2['Volume']==0.0)/len(df2)
if div_pct*100 < 0.1:
# Could be a 0.01x error
possibilities.append({'state':'div-too-small', 'diff':0.0})
# elif div_pct > 1.0:
# Update: lower threshold for illiquid stocks, because why paying mega dividends?
elif (pct_zero_vol > 0.75 and div_pct > 0.25) or (div_pct > 1.0):
# Could be a 100x error
possibilities.append({'state':'div-too-big', 'diff':0.0})
else:
split = df2['Stock Splits'].loc[dt]
if split == 0.0:
div_postSplit = None
else:
# Maybe Yahoo has not applied coincident split to dividend
div_postSplit = div / split
if div_postSplit > div:
# Use volatility-adjusted drop
_drop = drop - typical_volatility
else:
_drop = drop_2Dmax
if _drop > 0:
diff = abs(div-_drop)
diff_postSplit = abs(div_postSplit-_drop)
if (diff_postSplit * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-pre-split', 'diff':diff_postSplit})
# Check for div-too-big
if div_pct > too_big_check_threshold:
if drop_2Dmax <= 0.0:
possibilities.append({'state':'div-too-big', 'diff':0.0})
else:
diff = abs(div-drop_2Dmax)
diff_fx = abs((div/currency_divide)-drop_2Dmax)
if div_postSplit is None:
if (diff_fx * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-big', 'diff':diff_fx})
else:
diff_fxPostSplit = abs((div_postSplit/currency_divide)-drop_2Dmax)
if diff_fx < diff_fxPostSplit:
if (diff_fx * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-big', 'diff':diff_fx})
else:
if (diff_fxPostSplit * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-big-and-pre-split', 'diff':diff_fxPostSplit})
# Check for div-too-small - can be tricked by normal price volatility
if not np.isnan(typical_volatility):
# drop_wo_vol = drop_2Dmax - typical_volatility
# Update: only use same-day change for too-small, to reduce false-positives
drop_wo_vol = drop - typical_volatility
if drop_wo_vol > 0 and intraday and prepost:
# First, check if pre/post silly games
if day_move < 0.2*drop_wo_vol:
# Price recovered by end of trading session,
# so class this as false positive
drop_wo_vol = 0
if drop_wo_vol > 0:
diff = abs(div-drop_wo_vol)
diff_fx = abs((div*currency_divide)-drop_wo_vol)
if div_postSplit is None:
if (diff_fx * div_too_small_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-small', 'diff':diff_fx})
else:
diff_fxPostSplit = abs((div_postSplit*currency_divide)-drop_wo_vol)
if diff_fx < diff_fxPostSplit:
if (diff_fx * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-small', 'diff':diff_fx})
else:
if (diff_fxPostSplit * div_too_big_improvement_threshold) <= diff:
possibilities.append({'state':'div-too-small-and-pre-split', 'diff':diff_fxPostSplit})
div_status = {'date': dt, 'idx':div_idx, 'div': div, '%': div_pct}
div_status['drop'] = drop
div_status['drop_2Dmax'] = drop_2Dmax
div_status['volume'] = df2['Volume'].iloc[div_idx]
div_status['vol'] = typical_volatility
div_status['div_too_big'] = False
div_status['div_too_small'] = False
div_status['div_pre_split'] = False
div_status['div_too_big_and_pre_split'] = False
div_status['div_too_small_and_pre_split'] = False
if len(possibilities) > 0:
# Something is wrong with dividend - pick the best correction
possibilities = sorted(possibilities, key=lambda k: k['diff'])
p = possibilities[0]
div_status[p['state'].replace('-', '_')] = True
row = pd.DataFrame([div_status]).set_index('date')
if div_status_df is None:
div_status_df = row
else:
div_status_df = pd.concat([div_status_df, row])
if div_status_df is None and not df_modified:
return df
checks = [c for c in div_status_df.columns if c.startswith('div_')]
div_status_df = div_status_df.sort_index()
def cluster_dividends(df, column='div', threshold=7):
n = len(df)
sorted_df = df.sort_values(column)
clusters = []
current_dts = [sorted_df.index[0]]
currents_vals = [sorted_df[column].iloc[0]]
for i in range(1, n):
dt = sorted_df.index[i]
div = sorted_df[column].iloc[i]
if (div / np.mean(currents_vals)) < threshold:
# Add
current_dts.append(dt)
currents_vals.append(div)
else:
# New cluster
clusters.append(current_dts)
current_dts = [dt]
currents_vals = [div]
clusters.append(current_dts)
cluster_labels = np.array([-1]*n)
ctr = 0
for i, cluster in enumerate(clusters):
nc = len(cluster)
cluster_labels[ctr:ctr+nc] = i
ctr += nc
return cluster_labels
# Check if the present div-adjustment is too big/small, or missing
# - too-big determined from Adj Close movement vs Close
# - too-small compares Adj Close vs dividends
for i in range(len(div_status_df)):
div_idx = div_status_df['idx'].iloc[i]
dt = div_status_df.index[i]
div = div_status_df['div'].iloc[i]
if div_idx == 0:
continue
div_pct = div / df2['Close'].iloc[div_idx-1]
# First, check if Yahoo failed to apply dividend to Adj Close
pre_adj = df2['Adj Close'].iloc[div_idx-1] / df2['Close'].iloc[div_idx-1]
post_adj = df2['Adj Close'].iloc[div_idx] / df2['Close'].iloc[div_idx]
div_missing_from_adjclose = post_adj == pre_adj
# Check if adjustment too small
present_adj = pre_adj / post_adj
implied_div_yield = 1.0 - present_adj
div_adj_is_too_small = implied_div_yield < (0.1*div_pct)
# ... and use same method for adjustment too big:
div_adj_exceeds_div = implied_div_yield > (10*div_pct)
# Can prune the space:
if div_missing_from_adjclose:
div_adj_is_too_small = False # redundant information
div_status = {'present adj': present_adj}
div_status['adj_missing'] = div_missing_from_adjclose
div_status['adj_exceeds_div'] = div_adj_exceeds_div
div_status['div_exceeds_adj'] = div_adj_is_too_small
for k,v in div_status.items():
if k not in div_status_df:
if isinstance(v, (bool, np.bool_)):
div_status_df[k] = False
elif isinstance(v, int):
div_status_df[k] = 0
elif isinstance(v, float):
div_status_df[k] = 0.0
# elif k == 'div_true_date':
# div_status_df[k] = pd.Series(dtype='datetime64[ns, UTC]')
else:
raise ValueError(k,v,type(v))
div_status_df.loc[dt, k] = v
checks += ['adj_missing', 'adj_exceeds_div', 'div_exceeds_adj']
div_status_df['phantom'] = False
phantom_proximity_threshold = _datetime.timedelta(days=17)
f = div_status_df[['div_too_big', 'div_exceeds_adj']].any(axis=1)
if f.any() and len(div_status_df) > 1:
# One/some of these may be phantom dividends. Clue is if another correct dividend is very close
indices = np.where(f)[0]
dts_to_check = div_status_df.index[f]
for i in indices:
div = div_status_df.iloc[i]
div_dt = div.name
phantom_dt = None
if i > 0:
other_div = div_status_df.iloc[i-1]
else:
other_div = div_status_df.iloc[i+1]
ratio1 = (div['div']/currency_divide) / other_div['div']
ratio2 = div['div'] / other_div['div']
divergence = min(abs(ratio1-1.0), abs(ratio2-1.0))
if abs(div_dt-other_div.name) <= phantom_proximity_threshold and not other_div['phantom'] and divergence < 0.01:
if other_div.name in dts_to_check:
# Both this and previous are anomalous, so mark smallest drop as phantom
drop = div['drop']
drop_next = other_div['drop']
if drop > 1.5*drop_next:
phantom_dt = other_div.name
else:
phantom_dt = div_dt
else:
phantom_dt = div_dt
if phantom_dt:
div_status_df.loc[phantom_dt, 'phantom'] = True
for c in checks:
if c in div_status_df.columns:
div_status_df.loc[phantom_dt, c] = False
# There might be other phantom dividends - in close proximity and almost-equal to another div.
# But harder to decide which is the phantom and which is real.
# Assume phantom has much smaller price drop, otherwise assume is newer.
# ratio_threshold = 0.01
ratio_threshold = 0.08 # increased for KAP.IL 2022-July
div_status_df = div_status_df.sort_index()
for i in range(1, len(div_status_df)):
div = div_status_df.iloc[i]
div_dt = div.name
this_is_phantom = False
last_is_phantom = False
drop = div['drop']
last_div = div_status_df.iloc[i-1]
ratio = div['div'] / last_div['div']
if abs(div_dt-last_div.name) <= phantom_proximity_threshold and not last_div['phantom'] and not div['phantom'] and abs(ratio-1.0) < ratio_threshold:
last_drop = div_status_df['drop'].iloc[i-1]
if drop > 1.5*last_drop:
last_is_phantom = True
else:
this_is_phantom = True
if last_is_phantom or this_is_phantom:
phantom_div_dt = div_dt if this_is_phantom else div_status_df.index[i-1]
div_status_df.loc[phantom_div_dt, 'phantom'] = True
for c in checks:
if c in div_status_df.columns:
div_status_df.loc[phantom_div_dt, c] = False
checks.append('phantom')
# Remove phantoms early
if 'phantom' in div_status_df.columns:
f_phantom = div_status_df['phantom']
# ... but only if no other problems
f_phantom = f_phantom & (~div_status_df[[c for c in checks if c != 'phantom']].any(axis=1))
if f_phantom.any():
div_dts = div_status_df.index[f_phantom]
msg = f'Removing phantom div(s): {[str(dt.date()) for dt in div_dts]}'
logger.info(msg, extra=log_extras)
phantom_div_dts = div_status_df.index[f_phantom]
for dt in phantom_div_dts:
enddt = dt-_datetime.timedelta(seconds=1)
df2.loc[ :enddt, 'Adj Close'] /= div_status_df['present adj'].loc[dt]
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] /= div_status_df['present adj'].loc[dt]
df2_nan.loc[:enddt, 'Repaired?'] = True
df2.loc[dt, 'Dividends'] = 0
df_modified = True
div_status_df = div_status_df.drop(dt)
div_status_df.loc[f_phantom, 'phantom'] = False
div_status_df = div_status_df.drop('phantom', axis=1)
if 'phantom' in checks:
checks.remove('phantom')
if not div_status_df[checks].any().any():
# Maybe failed to detect a too-small div. If div is ~0.01x of previous and next, then
# treat as a 0.01x error
if len(div_status_df) > 1:
for i in range(0, len(div_status_df)):
r_pre, r_post = None, None
if i > 0:
r_pre = div_status_df['%'].iloc[i-1] / div_status_df['%'].iloc[i]
if i < (len(div_status_df)-1):
r_post = div_status_df['%'].iloc[i+1] / div_status_df['%'].iloc[i]
r_pre = r_pre or r_post
r_post = r_post or r_pre
if abs(r_pre-currency_divide)<20 and abs(r_post-currency_divide)<20:
div_dt = div_status_df.index[i]
div_status_df.loc[div_dt, 'div_too_small'] = True
if not div_status_df[checks].any().any():
# Perfect
if df_modified:
if not df2_nan.empty:
df2 = pd.concat([df2, df2_nan]).sort_index()
return df2
else:
return df
# Check if the present div-adjustment contradicts price action
for i in range(len(div_status_df)):
div_idx = div_status_df['idx'].iloc[i]
dt = div_status_df.index[i]
div = div_status_df['div'].iloc[i]
if div_idx == 0:
continue
div_pct = div / df2['Close'].iloc[div_idx-1]
# Adj Close should drop by LESS than Close on ex-div, at least for big dividends.
# Update: Yahoo might be reporting dividend slightly early, meaning
# Mr Market's price drop happens tomorrow e.g. UNTC in december 2023.
# Or worse, Yahoo is 1 month early e.g. GWI.L ex-div was mid-April not mid-March
lookahead_date = dt+_datetime.timedelta(days=35)
lookahead_idx = bisect.bisect_left(df2.index, lookahead_date)
lookahead_idx = min(lookahead_idx, len(df2)-1)
# In rare cases, the price dropped 1 day before dividend (DVD.OL @ 2024-05-15)
lookback_idx = max(0, div_idx-14)
# Check for bad stock splits in the lookahead period -
# if present, reduce lookahead to before.
future_changes = df2['Close'].iloc[div_idx:lookahead_idx+1].pct_change()
f_big_change = (future_changes > 2).to_numpy() | (future_changes < -0.9).to_numpy()
if f_big_change.any():
lookahead_idx = div_idx + np.where(f_big_change)[0][0]-1
lookahead_date = df2.index[lookahead_idx]
div_adj_exceeds_prices = False
div_date_wrong = False
div_true_date = pd.NaT
if lookahead_idx > lookback_idx:
x = df2.iloc[lookback_idx:lookahead_idx+1].copy()
x['Adj'] = x['Adj Close'] / x['Close']
x['Adj Low'] = x['Adj'] * x['Low']
deltas = x['Low'].iloc[1:].to_numpy() - x['Close'].iloc[:-1].to_numpy()
deltas = np.append([0.0], deltas)
x['delta'] = deltas
adjDeltas = x['Adj Low'].iloc[1:].to_numpy() - x['Adj Close'].iloc[:-1].to_numpy()
adjDeltas = np.append([0.0], adjDeltas)
x['adjDelta'] = adjDeltas
deltas = x[['delta', 'adjDelta']]
if div_pct > 0.05 and div_pct < 1.0:
adjDiv = div * x['Adj'].iloc[0]
f = deltas['adjDelta'] > (adjDiv*0.6)
if f.any():
indices = np.where(f)[0]
for idx in indices:
adjDelta_drop = deltas['adjDelta'].iloc[idx]
if adjDelta_drop > 1.001*deltas['delta'].iloc[idx]:
# Adjusted price has risen by more than unadjusted, should not happen.
# See if Adjusted price later falls by a similar amount. This would mean
# dividend has been applied too early.
ratios = (-1*deltas['adjDelta'])/adjDelta_drop
f_near1_or_above = ratios>=0.8
# Update: only check for wrong date if no coincident split.
# Because if a split, more likely the div is missing split
split = df2['Stock Splits'].loc[dt]
pre_split = div_status_df['div_pre_split'].loc[dt]
if (split==0.0 or (not pre_split)) and f_near1_or_above.any():
near_indices = np.where(f_near1_or_above)[0]
if len(near_indices) > 1:
penalties = np.zeros(len(near_indices))
for i in range(len(near_indices)):
idx = near_indices[i]
dti = ratios.index[idx]
if dti < dt:
penalties[i] += (dt-dti).days
else:
penalties[i] += 0.1*(dti-dt).days
i = np.argmin(penalties)
reversal_idx = near_indices[i]
else:
reversal_idx = near_indices[0]
div_date_wrong = True
div_true_date = ratios.index[reversal_idx]
break
elif adjDelta_drop > 0.39*adjDiv:
# Still true that applied adjustment exceeds price action,
# just not clear what solution is (if any).
if (x['Adj']<1.0).any():
div_adj_exceeds_prices = True
break
# Can prune the space:
div_adj_is_too_small = div_status_df.loc[dt, 'div_exceeds_adj']
if div_adj_exceeds_prices and div_adj_is_too_small:
# Contradiction. Assume former tricked by low-liquidity price action
div_adj_exceeds_prices = False
div_status = {}
div_status['adj_exceeds_prices'] = div_adj_exceeds_prices
div_status['div_date_wrong'] = div_date_wrong
div_status['div_true_date'] = div_true_date
if div_adj_exceeds_prices:
split = df2['Stock Splits'].loc[dt]
if split != 0.0:
# Check again if div missing split. Use looser tolerance
# as we know the adjustment seems wrong.
div_postSplit = div / split
if div_postSplit > div:
# Use volatility-adjusted drop
typical_volatility = div_status_df['vol'].loc[dt]
drop = div_status_df['drop'].loc[dt]
_drop = drop - typical_volatility
else:
drop_2Dmax = div_status_df['drop_2Dmax'].loc[dt]
_drop = drop_2Dmax
if _drop > 0:
diff = abs(div-_drop)
diff_postSplit = abs(div_postSplit-_drop)
if diff_postSplit <= (diff*1.1):
# possibilities.append({'state':'div-pre-split', 'diff':diff_postSplit})
div_status_df.loc[dt, 'div_pre_split'] = True
for k,v in div_status.items():
if k not in div_status_df:
if isinstance(v, (bool, np.bool_)):
div_status_df[k] = False
elif isinstance(v, int):
div_status_df[k] = 0
elif isinstance(v, float):
div_status_df[k] = 0.0
elif k == 'div_true_date':
div_status_df[k] = pd.Series(dtype='datetime64[ns, UTC]')
else:
raise ValueError(k,v,type(v))
div_status_df.loc[dt, k] = v
if 'div_too_big' in div_status_df.columns and 'div_date_wrong' in div_status_df.columns:
# Where div_date_wrong = True, discard div_too_big. Helps with false-positive handling later.
div_status_df.loc[div_status_df['div_date_wrong'].to_numpy(), 'div_too_big'] = False
checks += ['adj_exceeds_prices', 'div_date_wrong']
for c in checks:
if not div_status_df[c].any():
div_status_df = div_status_df.drop(c, axis=1)
c = 'div_true_date'
if c in div_status_df.columns and div_status_df[c].isna().all():
div_status_df = div_status_df.drop(c, axis=1)
checks = [c for c in checks if c in div_status_df.columns]
# With small dividends e.g. < 10%, my error detecting logic can be tricked by price volatility.
# But by looking at all the dividends, can find errors that previous logic missed.
div_status_df = div_status_df.sort_values('%')
div_status_df['cluster'] = cluster_dividends(div_status_df, column='%')
# Check for inconsistencies
cluster_ids = div_status_df['cluster'].unique()
for cid in cluster_ids:
fc = div_status_df['cluster'] == cid
cluster = div_status_df[fc].sort_index()
n = len(cluster)
div_pcts = cluster[['%']].copy()
if len(div_pcts) > 1:
time_diffs = div_pcts['%'].index.to_series().diff().dt.total_seconds() / (365.25 * 24 * 60 * 60)
time_diffs.loc[time_diffs.index[0]] = time_diffs.iloc[1]
div_pcts['period'] = time_diffs
div_pcts['avg yr yield'] = div_pcts['%'] / div_pcts['period']
for c in checks:
if not cluster[c].to_numpy().any():
cluster = cluster.drop(c, axis=1)
cluster_checks = [c for c in checks if c in cluster.columns]
for c in cluster_checks:
f_fail = cluster[c].to_numpy()
n_fail = np.sum(f_fail)
if n_fail in [0, n]:
continue
pct_fail = n_fail / n
if c == 'div_too_big':
true_threshold = 1.0
fals_threshold = 0.25
if 'div_date_wrong' in cluster.columns and (cluster[c] == cluster['div_date_wrong']).all():
continue
if 'adj_exceeds_prices' in cluster.columns and (cluster[c] == (cluster[c] & cluster['adj_exceeds_prices'])).all():
# Treat div_too_big=False as false positives IFF adj_exceeds_prices=true AND
# true ratio above (lowered) threshold.
true_threshold = 0.5
f_adj_exceeds_prices = cluster['adj_exceeds_prices'].to_numpy()
n = np.sum(f_adj_exceeds_prices)
n_fail = np.sum(f_fail[f_adj_exceeds_prices])
pct_fail = n_fail / n
if pct_fail > true_threshold:
f = fc & div_status_df['adj_exceeds_prices'].to_numpy()
div_status_df.loc[f, c] = True
continue
if 'div_exceeds_adj' in cluster.columns and cluster['div_exceeds_adj'].all():
# Dividend too big for prices AND the present adjustment,
# more likely the dividends are too big.
if (cluster['vol'][fc][f_fail]==0).all():
# No trading volume to cross-check, so higher thresholds
fals_threshold = 2/3
else:
# Relax thresholds
true_threshold = 0.25
elif 'adj_exceeds_prices' in cluster.columns and (cluster[c]==cluster['adj_exceeds_prices']).all():
# Both dividend and present adjust too big for prices,
# more likely the dividends are too big.
true_threshold = 1/2
else:
fals_threshold = 1/2
if pct_fail >= true_threshold:
div_status_df.loc[fc, c] = True
if 'div_date_wrong' in div_status_df.columns:
# reset this as well
div_status_df.loc[fc, 'div_date_wrong'] = False
div_status_df.loc[fc, 'div_true_date'] = pd.NaT
cluster = div_status_df[fc].sort_index()
continue
elif pct_fail <= fals_threshold:
div_status_df.loc[fc, c] = False
continue
if c == 'div_too_small':
true_threshold = 1.0
fals_threshold = 0.15
if 'adj_exceeds_div' not in cluster.columns:
# Adjustment confirms dividends => more likely that 'div_too_small' are false positives: NOT too small
true_threshold = 6/11
fals_threshold = 1/2
if pct_fail >= true_threshold:
div_status_df.loc[fc, c] = True
continue
elif pct_fail <= fals_threshold:
div_status_df.loc[fc, c] = False
continue
if c == 'adj_missing':
if cluster[c].iloc[-1] and n_fail == 1:
# Only the latest/last row is missing, genuine error
continue
if c == 'div_exceeds_adj':
continue
if c == 'adj_exceeds_prices':
continue
if c == 'phantom':
continue
if c == 'div_date_wrong':
# Fine, these should be rare
continue
if c in ['div_pre_split', 'div_too_big_and_pre_split']:
# Fine, these should be rare
continue
if 'div_too_big' in checks and 'div_exceeds_adj' in checks:
c = "adj_too_small"
div_status_df[c] = False
for i in range(len(div_status_df)):
dt = div_status_df.index[i]
row = div_status_df.iloc[i]
if row['div_too_big'] and row['div_exceeds_adj']:
# Check if div_too_big AND adj-too-small-for-prices
div_yield = row['div']
pct = row['%']
close = div_yield/pct
adj_present = row['present adj']
implied_div_yield = (1-adj_present)*close
ratio = div_yield/implied_div_yield
also_correct_adj = abs(ratio-(currency_divide*currency_divide)) < currency_divide
if also_correct_adj:
div_status_df.loc[dt, c] = True
if not div_status_df[c].any():
div_status_df = div_status_df.drop(c, axis=1)
else:
checks.append(c)
if 'div_too_big_and_pre_split' in div_status_df.columns:
for c in ['div_too_big', 'div_pre_split']:
if c in div_status_df:
div_status_df[c] = div_status_df[c] | div_status_df['div_too_big_and_pre_split']
else:
div_status_df[c] = div_status_df['div_too_big_and_pre_split']
checks.append(c)
div_status_df = div_status_df.drop('div_too_big_and_pre_split', axis=1)
checks.remove('div_too_big_and_pre_split')
div_status_df = div_status_df.sort_index()
# Discard dividends with no problems
div_status_df = div_status_df[div_status_df[checks].any(axis=1)]
if div_status_df.empty:
if not df2_nan.empty:
df2 = pd.concat([df2, df2_nan]).sort_index()
return df2
# These arrays track changes for constructing compact log messages
div_repairs = {}
for cid in list(div_status_df['cluster'].unique()):
cluster = div_status_df[div_status_df['cluster']==cid]
cluster = cluster.sort_index(ascending=False)
cluster['Fixed?'] = False
# Reverse order because may delete false-positives
for i in range(len(cluster)-1, -1, -1):
row = cluster.iloc[i]
dt = row.name
enddt = dt-_datetime.timedelta(seconds=1)
adj_missing = 'adj_missing' in row and row['adj_missing']
div_exceeds_adj = 'div_exceeds_adj' in row and row['div_exceeds_adj']
adj_exceeds_div = 'adj_exceeds_div' in row and row['adj_exceeds_div']
adj_exceeds_prices = 'adj_exceeds_prices' in row and row['adj_exceeds_prices']
div_too_small = 'div_too_small' in row and row['div_too_small']
div_too_big = 'div_too_big' in row and row['div_too_big']
div_pre_split = 'div_pre_split' in row and row['div_pre_split']
# div_too_small_and_pre_split = 'div_too_small_and_pre_split' in row and row['div_too_small_and_pre_split'] # not happened yet
# div_too_big_and_pre_split = 'div_too_big_and_pre_split' in row and row['div_too_big_and_pre_split'] # not happened yet
div_date_wrong = 'div_date_wrong' in row and row['div_date_wrong']
adj_too_small = 'adj_too_small' in row and row['adj_too_small']
n_failed_checks = np.sum([row[c] for c in checks if c in row])
if div_too_big and adj_exceeds_prices and n_failed_checks == 2:
# adj_exceeds_prices is redundant information, fixing div-too-big
# will fix adjustment
adj_exceeds_prices = False
n_failed_checks -= 1
if div_date_wrong:
if div_too_big:
# redundant information
div_too_big = False
cluster.loc[dt, 'div_too_big'] = False
n_failed_checks -= 1
if div_exceeds_adj:
# false-positive
div_exceeds_adj = False
cluster.loc[dt, 'div_exceeds_adj'] = False
n_failed_checks -= 1
if div_pre_split:
if adj_exceeds_prices:
# redundant information
adj_exceeds_prices = False
cluster.loc[dt, 'adj_exceeds_prices'] = False
n_failed_checks -= 1
if n_failed_checks == 1:
if div_exceeds_adj or adj_exceeds_div:
# Simply recalculate Adj Close
k = 'too-small div-adjust' if div_exceeds_adj else 'too-big div-adjust'
div_repairs.setdefault(k, []).append(dt)
adj_correction = (1.0 - row['%']) / row['present adj']
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
elif div_too_small:
# Fix both dividend and adjustment
# - div_too_small looks fine, the adj also needs repair because compared against div
k = 'too-small div'
correction = currency_divide
correct_div = row['div'] * correction
df2.loc[dt, 'Dividends'] = correct_div
# adj is correct *compared to the present div*, so needs rescaling
# to match corrected dividend
k += ' & div-adjust'
target_adj = 1.0 - ((1.0 - row['present adj']) * correction)
adj_correction = target_adj / row['present adj']
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
div_repairs.setdefault(k, []).append(dt)
elif div_too_big:
k = 'too-big div'
correction = 1.0/currency_divide
correct_div = row['div'] * correction
df2.loc[dt, 'Dividends'] = correct_div
target_div_pct = row['%'] * correction
target_adj = 1.0 - target_div_pct
present_adj = row['present adj']
k += ' & div-adjust'
adj_correction = target_adj / present_adj
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
div_repairs.setdefault(k, []).append(dt)
elif adj_missing:
k = 'missing div-adjust'
div_repairs.setdefault(k, []).append(dt)
adj_correction = 1.0-row['%']
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
elif div_date_wrong:
k = 'wrong ex-div date'
div_repairs.setdefault(k, []).append(dt)
# First rollback the present adj
adj_correction = 1.0/row['present adj']
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
# Apply correct adj from correct date
div_true_date = row['div_true_date']
close_before = df2['Close'].iloc[row['idx']]
div = row['div']
true_adj = 1.0 - div/close_before
enddt2 = div_true_date-_datetime.timedelta(seconds=1)
df2.loc[ :enddt2, 'Adj Close'] *= true_adj
df2_nan.loc[:enddt2, 'Adj Close'] *= true_adj
# Move div to correct date
df2.loc[div_true_date, 'Dividends'] += div
df2.loc[dt, 'Dividends'] = 0
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
elif adj_exceeds_prices:
# Nothing else wrong => probably false positive,
# but no harm checking the adjustment
target_adj = 1.0 - row['%']
present_adj = row['present adj']
if abs((target_adj/present_adj)-1) > 0.05:
# Also correct adjustment to match corrected dividend
k += ' & div-adjust'
adj_correction = target_adj / present_adj
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
else:
div_status_df = div_status_df.drop(dt)
cluster = cluster.drop(dt)
elif div_pre_split:
k = 'pre-split div'
correction = 1.0/df2['Stock Splits'].loc[dt]
correct_div = row['div'] * correction
df2.loc[dt, 'Dividends'] = correct_div
target_div_pct = row['%'] * correction
target_adj = 1.0 - target_div_pct
present_adj = row['present adj']
# Also correct adjustment to match corrected dividend
k += ' & div-adjust'
adj_correction = target_adj / present_adj
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
div_repairs.setdefault(k, []).append(dt)
elif n_failed_checks == 2:
if div_too_big and adj_missing:
# A currency unit mixup AND adjustment missing
k = 'too-big div and missing div-adjust'
div_repairs.setdefault(k, []).append(dt)
adj_correction = 1.0 - row['%']/currency_divide
df2.loc[dt, 'Dividends'] /= currency_divide
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
elif div_too_big and div_exceeds_adj:
div = row['div']
close = div/row['%']
adj_present = row['present adj']
# Adj Close is correct, just need to fix Dividend.
# Probably just a currency unit mixup.
df2.loc[dt, 'Dividends'] /= currency_divide
k = 'div-too-big'
div_repairs.setdefault(k, []).append(dt)
cluster.loc[dt, 'Fixed?'] = True
elif div_too_big and adj_exceeds_prices:
# Assume div 100x error, and that Yahoo used this wrong dividend.
# 'adj_too_big=True' is probably redundant information, knowing div too big
# is enough to require also fixing adjustment
k = 'too-big div & div-adjust'
div_repairs.setdefault(k, []).append(dt)
target_div_pct = row['%']/currency_divide
target_adj = 1.0 - target_div_pct
adj_correction = target_adj / row['present adj']
df2.loc[dt, 'Dividends'] /= currency_divide
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
elif div_too_small and adj_exceeds_div:
# Adj Close is correct, just need to fix Dividend.
# Probably just a currency unit mixup.
df2.loc[dt, 'Dividends'] *= currency_divide
k = 'too-small div'
if 'FX was repaired' in row and row['FX was repaired']:
# Complication: not just a currency unit mixup, but
# mixed up the local currency with $. So need to
# recalculate adjustment.
msg = None
div_adj = 1.0 - (row['%']*currency_divide)
adj_correction = div_adj / row['present adj']
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
# Currently not logging this FX-fix event, since I refactored fixing.
k += " and FX mixup"
div_repairs.setdefault(k, []).append(dt)
cluster.loc[dt, 'Fixed?'] = True
elif n_failed_checks == 3:
if div_too_big and div_exceeds_adj and div_pre_split:
k = 'too-big div & pre-split'
correction = (1.0/currency_divide) * (1.0/df2['Stock Splits'].loc[dt])
correct_div = row['div'] * correction
df2.loc[dt, 'Dividends'] = correct_div
target_div_pct = row['%'] * correction
target_adj = 1.0 - target_div_pct
present_adj = row['present adj']
# Also correct adjustment to match corrected dividend
k += ' & div-adjust'
adj_correction = target_adj / present_adj
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
cluster.loc[dt, 'Fixed?'] = True
div_repairs.setdefault(k, []).append(dt)
elif div_too_big and div_exceeds_adj and adj_too_small:
# Need to fix dividend AND adj close.
# Probably just a currency unit mixup.
div = row['div']
close = div/row['%']
adj_present = row['present adj']
k = 'div-too-big and adj-too-small'
#
div_true = div/currency_divide
pct_true = div_true / close
df2.loc[dt, 'Dividends'] = div_true
#
adj_correct = 1.0 - pct_true
adj_correction = adj_correct / adj_present
df2.loc[ :enddt, 'Adj Close'] *= adj_correction
df2.loc[ :enddt, 'Repaired?'] = True
df2_nan.loc[:enddt, 'Adj Close'] *= adj_correction
df2_nan.loc[:enddt, 'Repaired?'] = True
div_repairs.setdefault(k, []).append(dt)
cluster.loc[dt, 'Fixed?'] = True
if cluster.empty:
continue
for k in div_repairs:
msg = f"Repaired {k}: {[str(dt.date()) for dt in sorted(div_repairs[k])]}"
logger.info(msg, extra=log_extras)
if not df2_nan.empty:
df2 = pd.concat([df2, df2_nan]).sort_index()
return df2
@utils.log_indent_decorator
def _fix_bad_stock_splits(self, df, interval, tz_exchange):
# Original logic only considered latest split adjustment could be missing, but
# actually **any** split adjustment can be missing. So check all splits in df.
#
# Improved logic looks for BIG daily price changes that closely match the
# **nearest future** stock split ratio. This indicates Yahoo failed to apply a new
# stock split to old price data.
#
# There is a slight complication, because Yahoo does another stupid thing.
# Sometimes the old data is adjusted twice. So cannot simply assume
# which direction to reverse adjustment - have to analyse prices and detect.
# Not difficult.
if df.empty:
return df
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'split-repair', 'yf_interval': interval, 'yf_symbol': self.ticker}
interday = interval in ['1d', '1wk', '1mo', '3mo']
if not interday:
return df
df = df.sort_index() # scan splits oldest -> newest
split_f = df['Stock Splits'].to_numpy() != 0
if not split_f.any():
logger.debug('price-repair-split: No splits in data')
return df
logger.debug(f'Splits: {str(df["Stock Splits"][split_f].to_dict())}', extra=log_extras)
if 'Repaired?' not in df.columns:
df['Repaired?'] = False
for split_idx in np.where(split_f)[0]:
split_dt = df.index[split_idx]
split = df.loc[split_dt, 'Stock Splits']
if split_dt == df.index[0]:
continue
# Add on a week:
if interval in ['1wk', '1mo', '3mo']:
split_idx += 1
else:
split_idx += 5
cutoff_idx = min(df.shape[0], split_idx) # add one row after to detect big change
df_pre_split = df.iloc[0:cutoff_idx+1]
logger.debug(f'split_idx={split_idx} split_dt={split_dt.date()} split={split:.4f}', extra=log_extras)
logger.debug(f'df dt range: {df_pre_split.index[0].date()} -> {df_pre_split.index[-1].date()}', extra=log_extras)
df_pre_split_repaired = self._fix_prices_sudden_change(df_pre_split, interval, tz_exchange, split, correct_volume=True, correct_dividend=True)
# Merge back in:
if cutoff_idx == df.shape[0]-1:
df = df_pre_split_repaired
else:
df_post_cutoff = df.iloc[cutoff_idx+1:]
if df_post_cutoff.empty:
df = df_pre_split_repaired.sort_index()
else:
df = pd.concat([df_pre_split_repaired.sort_index(), df_post_cutoff])
return df
@utils.log_indent_decorator
def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, unit_switch=False, correct_volume=False, correct_dividend=False):
if df.empty:
return df
logger = utils.get_yf_logger()
log_extras = {'yf_cat': 'price-change-repair', 'yf_interval': interval, 'yf_symbol': self.ticker}
split = change
split_rcp = 1.0 / split
interday = interval in ['1d', '1wk', '1mo', '3mo']
multiday = interval in ['1wk', '1mo', '3mo']
if unit_switch:
fix_type = '100x error'
log_extras['yf_cat'] = 'price-repair-100x'
start_min = None
else:
fix_type = 'bad split'
log_extras['yf_cat'] = 'price-repair-split'
# start_min = 1 year before oldest split
f = df['Stock Splits'].to_numpy() != 0.0
start_min = (df.index[f].min() - _dateutil.relativedelta.relativedelta(years=1)).date()
logger.debug(f'start_min={start_min} change={change:.4f} (rcp={1.0/change:.4f})', extra=log_extras)
OHLC = ['Open', 'High', 'Low', 'Close']
if interday and interval != '1d':
# Yahoo creates multi-day intervals using potentiall corrupt data, e.g.
# the Close could be 100x Open. This means have to correct each OHLC column
# individually
correct_columns_individually = True
else:
correct_columns_individually = False
# Do not attempt repair of the split is small,
# could be mistaken for normal price variance
if 0.8 < split < 1.25:
logger.debug("Split ratio too close to 1. Won't repair", extra=log_extras)
return df
df2 = df.copy().sort_index(ascending=False)
if df2.index.tz is None:
df2.index = df2.index.tz_localize(tz_exchange)
elif df2.index.tz != tz_exchange:
df2.index = df2.index.tz_convert(tz_exchange)
n = df2.shape[0]
# If stock is currently suspended and not in USA, then usually Yahoo introduces
# 100x errors into suspended intervals. Clue is no price change and 0 volume.
# Better to use last active trading interval as baseline.
# f_no_activity = (df2['Low'] == df2['High']) & (df2['Volume']==0)
# Update: intra-interval 100x/0.01x errors can trick Low==High
f_no_activity = df2['Volume']==0
f_no_activity = f_no_activity | df2[OHLC].isna().all(axis=1)
appears_suspended = f_no_activity.any() and np.where(f_no_activity)[0][0]==0
f_active = ~f_no_activity
idx_latest_active = np.where(f_active & np.roll(f_active, 1))[0]
if len(idx_latest_active) == 0:
# In rare cases, not enough trading activity for 2+ consecutive days e.g. CLC.L
idx_latest_active = np.where(f_active)[0]
if len(idx_latest_active) == 0:
idx_latest_active = None
else:
idx_latest_active = int(idx_latest_active[0])
log_msg = f'appears_suspended={appears_suspended}, idx_latest_active={idx_latest_active}'
if idx_latest_active is not None:
log_msg += f' ({df2.index[idx_latest_active].date()})'
logger.debug(log_msg, extra=log_extras)
df_workings = df2.copy()
df_workings = df_workings.drop(['Adj Close', 'Dividends', 'Stock Splits', 'Repaired?'], axis=1, errors='ignore')
df_workings = df_workings.rename(columns={'Volume': 'Vol'})
fna = df_workings['Vol'].isna()
if fna.any():
df_workings['VolStr'] = ''
df_workings.loc[fna, 'VolStr'] = 'NaN'
df_workings.loc[~fna, 'VolStr'] = (df_workings['Vol'][~fna]/1e6).astype('int').astype('str') + 'm'
df_workings['Vol'] = df_workings['VolStr']
df_workings.drop('VolStr', axis=1)
else:
df_workings['Vol'] = (df_workings['Vol']/1e6).astype('int').astype('str') + 'm'
debug_cols = ['Close']
df_workings = df_workings.drop([c for c in OHLC if c not in debug_cols], axis=1, errors='ignore')
# Calculate daily price % change. To reduce effect of price volatility,
# calculate change for each OHLC column.
if interday and interval != '1d' and split not in [100.0, 100, 0.001]:
# Avoid using 'Low' and 'High'. For multiday intervals, these can be
# very volatile which reduces ability to detect genuine stock split errors
_1d_change_x = np.full((n, 2), 1.0)
price_data_cols = ['Open','Close']
price_data = df2[price_data_cols].to_numpy()
f_zero = price_data == 0.0
else:
_1d_change_x = np.full((n, 4), 1.0)
price_data_cols = OHLC
price_data = df2[price_data_cols].to_numpy()
f_zero = price_data == 0.0
if not price_data.flags.writeable:
price_data = price_data.copy()
if f_zero.any():
price_data[f_zero] = 1.0
# Update: if a VERY large dividend is paid out, then can be mistaken for a 1:2 stock split.
# Fix = use adjusted prices
f_zero = df2['Close'] == 0
if f_zero.any():
adj = np.ones(len(df2))
adj[~f_zero] = df2['Adj Close'].to_numpy()[~f_zero] / df2['Close'].to_numpy()[~f_zero]
else:
adj = df2['Adj Close'].to_numpy() / df2['Close'].to_numpy()
df_dtype = price_data.dtype
if df_dtype == np.int64:
price_data = price_data.astype('float')
for j in range(price_data.shape[1]):
price_data[:,j] *= adj
if OHLC[j] in df_workings.columns:
df_workings[price_data_cols[j]] *= adj
if df_dtype == np.int64:
price_data = price_data.astype('int')
_1d_change_x[1:] = price_data[1:, ] / price_data[:-1, ]
f_zero_num_denom = f_zero | np.roll(f_zero, 1, axis=0)
if f_zero_num_denom.any():
_1d_change_x[f_zero_num_denom] = 1.0
if interday and interval != '1d':
# average change
_1d_change_denoised = np.average(_1d_change_x, axis=1)
else:
# # change nearest to 1.0
# diff = np.abs(_1d_change_x - 1.0)
# j_indices = np.argmin(diff, axis=1)
# _1d_change_denoised = _1d_change_x[np.arange(n), j_indices]
# Still sensitive to extreme-low low. Try median:
_1d_change_denoised = np.median(_1d_change_x, axis=1)
f_na = np.isnan(_1d_change_denoised)
if f_na.any():
# Possible if data was too old for reconstruction.
_1d_change_denoised[f_na] = 1.0
# If all 1D changes are closer to 1.0 than split, exit
split_max = max(split, split_rcp)
if np.max(_1d_change_denoised) < (split_max - 1) * 0.5 + 1 and np.min(_1d_change_denoised) > 1.0 / ((split_max - 1) * 0.5 + 1):
logger.debug(f'No {fix_type}s detected', extra=log_extras)
return df
# Calculate the true price variance, i.e. remove effect of bad split-adjustments.
# Key = ignore 1D changes outside of interquartile range
q1, q3 = np.percentile(_1d_change_denoised, [25, 75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
f = (_1d_change_denoised >= lower_bound) & (_1d_change_denoised <= upper_bound)
avg = np.mean(_1d_change_denoised[f])
sd = np.std(_1d_change_denoised[f])
# Now can calculate SD as % of mean
sd_pct = sd / avg
logger.debug(f"Estimation of true 1D change stats: mean = {avg:.2f}, StdDev = {sd:.4f} ({sd_pct*100.0:.1f}% of mean)", extra=log_extras)
# Only proceed if split adjustment far exceeds normal 1D changes
largest_change_pct = 5 * sd_pct
if interday and interval != '1d':
largest_change_pct *= 3
if interval in ['1mo', '3mo']:
largest_change_pct *= 2
if max(split, split_rcp) < 1.0 + largest_change_pct:
logger.debug("Split ratio too close to normal price volatility. Won't repair", extra=log_extras)
logger.debug(f"sd_pct = {sd_pct:.4f} largest_change_pct = {largest_change_pct:.4f}", extra=log_extras)
return df
# Now can detect bad split adjustments
# Set threshold to halfway between split ratio and largest expected normal price change
r = _1d_change_denoised / split_rcp
split_max = max(split, split_rcp)
logger.debug(f"split_max={split_max:.3f} largest_change_pct={largest_change_pct:.4f}", extra=log_extras)
threshold = (split_max + 1.0 + largest_change_pct) * 0.5
logger.debug(f"threshold={threshold:.3f}, threshold_rcp={1.0/threshold:.3f}", extra=log_extras)
sudden_change_repaired = np.full(len(df2), False)
if correct_columns_individually:
_1d_change_x = np.full((n, 4), 1.0)
price_data = df2[OHLC].replace(0.0, 1.0).to_numpy()
price_data_cols = OHLC
# _1d_change_x = np.full((n, len(price_data_cols)), 1.0)
# price_data = df2[price_data_cols].replace(0.0, 1.0).to_numpy()
_1d_change_x[1:] = price_data[1:, ] / price_data[:-1, ]
else:
_1d_change_x = _1d_change_denoised
if correct_columns_individually:
for j in range(len(price_data_cols)):
c = price_data_cols[j]
df_workings[c+' 1D %'] = _1d_change_x[:, j]
df_workings[c+' 1D %'] = df_workings[c+' 1D %'].round(3)
else:
df_workings['1D %'] = _1d_change_denoised
# df_workings['1D %'] = df_workings['1D %'].round(2).astype('str')
df_workings['1D %'] = df_workings['1D %'].round(3)
r = _1d_change_x / split_rcp
f_down = _1d_change_x < 1.0 / threshold
# if f_down.any():
# # Discard where triggered by negative Adj Close after dividend
# f_neg = _1d_change_x < 0.0
# f_div = (df2['Dividends']>0).to_numpy()
# f_div_before = np.roll(f_div, 1)
# if f_down.ndim == 2:
# f_div_before = f_div_before[:, np.newaxis].repeat(f_down.shape[1], axis=1)
# f_down = f_down & ~(f_neg + f_div_before)
f_up = _1d_change_x > threshold
f_up_ndims = len(f_up.shape)
f_up_shifts = f_up if f_up_ndims==1 else f_up.any(axis=1)
# In rare cases e.g. real disasters, the price actually drops massively on huge volume
if f_up_shifts.any():
nf_up_shifts = ~f_up_shifts
flat_indices = np.where(nf_up_shifts)[0]
f_down_ndims = len(f_down.shape)
down_dts = df2.index[f_down if f_down_ndims==1 else f_down.any(axis=1)]
for idx in np.where(f_up_shifts)[0]:
i = idx-1 # this is when price actually dropped
dt = df2.index[i]
v = df2['Volume'].iloc[i]
vol_change_pct = 0 if v == 0 else df2['Volume'].iloc[i-1] / v
logger.debug(f"- vol_change_pct = {vol_change_pct:.4f}")
if multiday and (i+1 < len(df2)):
next_v = df2['Volume'].iloc[i+1]
if next_v > 0:
vol_change_pct = max(vol_change_pct, df2['Volume'].iloc[i] / next_v)
# if vol_change_pct > 5:
# # big volume change +500%
# # Could be false-positive, but need some more checks
# lookback = max(0, i-10)
# lookahead = min(len(df2), i+10)
# if (df2['Stock Splits'].iloc[lookback:lookahead]!=0.0).any():
# # There's a stock split near the volume spike, so
# # assume false positive
# continue
# avg_vol_after = df2['Volume'].iloc[lookback:i-1].mean()
# if not np.isnan(avg_vol_after) and avg_vol_after > 0 and v/avg_vol_after < 2.0:
# # volume spike is actually a step-change, so
# # probably missing stock split
# continue
# if f_up_ndims == 1:
# f_up[idx] = False
# else:
# f_up[idx,:] = False
# New method: look for a volume spike
# Select 20 rows after i (earlier in time)
# are not triggers (big price moves).
i_pos_in_flat_indices = nf_up_shifts[:i].sum()
start = max(0, i_pos_in_flat_indices - 15)
end = min(len(flat_indices), start+30+1)
block = df2.iloc[flat_indices[start:end]]
block = block.sort_index()
# block_before = block.loc[:dt-_datetime.timedelta(1)]
down_dts_from = down_dts[down_dts>=dt]
if len(down_dts_from) > 0:
next_down_dt = min(down_dts_from)
if next_down_dt == dt:
# Only this row has price drop, so will look like a volume spike but
# is definitely a data error to repair.
block_after = None
else:
block_after = block.loc[dt+_datetime.timedelta(1):next_down_dt-_datetime.timedelta(1)]
else:
block_after = block.loc[dt+_datetime.timedelta(1):]
if block_after is not None and block_after.empty:
block_after = None
def _calc_volume_zscore_weighted(volume, dt, block):
distances = np.abs((block.index - dt).total_seconds())
distances /= distances.max()
weights = np.exp(-distances)
weights = np.array(weights) / np.sum(weights)
values = block['Volume'].to_numpy()
weighted_mean = np.sum(values * weights)
weighted_variance = np.sum(weights * (values - weighted_mean) ** 2)
weighted_std = np.sqrt(weighted_variance)
# print(f"# weighted_variance = {weighted_variance:.4f}")
# print(f"# weighted_std = {weighted_std:.4f}")
z_score = (volume - weighted_mean) / weighted_std
# print(f"z_score = {z_score:.4f}")
return z_score
def _calc_volume_zscore(volume, block):
# print(f"_calc_volume_zscore(volume={volume})")
values = block['Volume'].to_numpy()
if len(values) == 0 or (values == 0).all():
return 0
std = np.std(values, ddof=1)
if std == 0.0:
return 0
mean = np.mean(values)
z_score = (volume - mean) / std
return z_score
# z_score_before = _calc_volume_zscore(v, block_before)
# print(f"z_score_before = {z_score_before:.4f}")
if block_after is not None:
z_score_after = _calc_volume_zscore(v, block_after)
# print(f"z_score_after = {z_score_after:.4f}")
z_score_after_d1 = _calc_volume_zscore(block_after['Volume'].iloc[0], block_after)
# print(f"z_score_after_d1 = {z_score_after_d1:.4f}")
# z_score_after_d2 = _calc_volume_zscore(block_after['Volume'].iloc[1], block_after)
# print(f"z_score_after_d2 = {z_score_after_d2:.4f}")
if max(z_score_after, z_score_after_d1) > 2:
# There was a volume spike around this date, so
# probably something happened NOT a missing stock split.
logger.debug(f"Detected false-positive split error on {dt.date()}, ignoring price drop")
if f_up_ndims == 1:
f_up[idx] = False
else:
f_up[idx,:] = False
f = f_down | f_up
if not correct_columns_individually:
df_workings['r'] = r
df_workings['down'] = f_down
df_workings['up'] = f_up
df_workings['r'] = df_workings['r'].round(2).astype('str')
df_workings['f'] = f
else:
for j in range(len(price_data_cols)):
c = price_data_cols[j]
df_workings[c+'_r'] = r[:, j]
df_workings[c+'_r'] = df_workings[c+'_r'].round(2).astype('str')
df_workings[c+'_down'] = f_down[:, j]
df_workings[c+'_up'] = f_up[:, j]
df_workings[c+'_f'] = f[:, j]
# Possible that extreme events caused the price spikes/dumps.
# So for each signal, calculate local stdev for a custom threshold.
for idx in np.where(f)[0]:
dt = df2.index[idx]
idx_end = min(len(df2)-1, idx+2)
if interval.endswith('d'):
lookback = 10
elif interval.endswith('m'):
lookback = 100
else:
lookback = 3
idx_start = max(0, idx-lookback)
changes_local = df_workings.iloc[idx_start:idx_end]
if correct_columns_individually:
cols = price_data_cols
else:
cols = ['n/a']
for c in cols:
if c == 'n/a':
clean_changes = changes_local['1D %'][~changes_local['f']].to_numpy()
else:
clean_changes = changes_local[c+' 1D %'][~changes_local[c+'_f']].to_numpy()
avg = np.mean(clean_changes)
sd = np.std(clean_changes)
sd_pct = sd / avg
largest_change_pct = 5 * sd_pct
if interday and interval != '1d':
largest_change_pct *= 3
if interval in ['1mo', '3mo']:
largest_change_pct *= 2
threshold = (split_max + 1.0 + largest_change_pct) * 0.5
if correct_columns_individually:
big_change = df_workings[c+' 1D %'].iloc[idx]
else:
big_change = df_workings['1D %'].iloc[idx]
if big_change < threshold and big_change > 1.0/threshold:
# This price change is actually similar to local price volatily. False positive
if correct_columns_individually:
logger.debug(f"Unusual '{c}' price action @ {dt.date()} is actually similar to local price volatility, so ignoring (StdDev % mean = {sd_pct*100:.1f}%)")
df_workings.loc[dt, c+'_f'] = False
else:
logger.debug(f"Unusual price action @ {dt.date()} is actually similar to local price volatility, so ignoring (StdDev % mean = {sd_pct*100:.1f}%)")
df_workings.loc[dt, 'f'] = False
if not correct_columns_individually:
f_down = f_down & df_workings['f'].to_numpy()
f_up = f_up & df_workings['f'].to_numpy()
else:
for j in range(len(price_data_cols)):
c = price_data_cols[j]
if c in debug_cols:
f_down[:, j] = f_down[:, j] & df_workings[c+'_f']
f_up[:, j] = f_up[:, j] & df_workings[c+'_f']
f = f_down | f_up
if not f.any():
logger.debug(f'No {fix_type}s detected', extra=log_extras)
return df
# Update: if any 100x changes are soon after a stock split, so could be confused with split error, then abort
threshold_days = 30
f_splits = df2['Stock Splits'].to_numpy() != 0.0
if change in [100.0, 0.01] and f_splits.any():
indices_A = np.where(f_splits)[0]
indices_B = np.where(f)[0]
if not len(indices_A) or not len(indices_B):
return None
gaps = indices_B[:, None] - indices_A
# Because data is sorted in DEscending order, need to flip gaps
gaps *= -1
f_pos = gaps > 0
if f_pos.any():
gap_min = gaps[f_pos].min()
gap_td = utils._interval_to_timedelta(interval) * gap_min
if isinstance(gap_td, _dateutil.relativedelta.relativedelta):
threshold = _dateutil.relativedelta.relativedelta(days=threshold_days)
else:
threshold = _datetime.timedelta(days=threshold_days)
if isinstance(threshold, _dateutil.relativedelta.relativedelta) and isinstance(gap_td, _dateutil.relativedelta.relativedelta):
idx = np.where(gaps==gap_min)[0][0]
dt = df2.index[idx]
within_threshold = (dt + gap_td) < (dt + threshold)
else:
within_threshold = gap_td < threshold
if within_threshold:
logger.info('100x changes are too soon after stock split events, aborting', extra=log_extras)
return df
if logger.isEnabledFor(logging.DEBUG):
df_workings['i'] = list(range(0, df_workings.shape[0]))
df_workings['i_rev'] = df_workings.shape[0]-1 - df_workings['i']
if correct_columns_individually:
f_change = df_workings[[c+'_down' for c in debug_cols]].any(axis=1) | df_workings[[c+'_up' for c in debug_cols]].any(axis=1)
else:
f_change = df_workings['down'] | df_workings['up']
f_change = f_change | np.roll(f_change, -1) | np.roll(f_change, 1) | np.roll(f_change, -2) | np.roll(f_change, 2)
with pd.option_context('display.max_rows', None, 'display.max_columns', 10, 'display.width', 1000): # more options can be specified also
logger.debug("price-repair-split: my workings:" + '\n' + str(df_workings[f_change]))
def map_signals_to_ranges(f, f_up, f_down):
# Ensure 0th element is False, because True is nonsense
if f[0]:
f = np.copy(f)
f[0] = False
f_up = np.copy(f_up)
f_up[0] = False
f_down = np.copy(f_down)
f_down[0] = False
if not f.any():
return []
true_indices = np.where(f)[0]
ranges = []
for i in range(len(true_indices) - 1):
if i % 2 == 0:
if split > 1.0:
adj = 'split' if f_down[true_indices[i]] else '1.0/split'
else:
adj = '1.0/split' if f_down[true_indices[i]] else 'split'
ranges.append((true_indices[i], true_indices[i + 1], adj))
if len(true_indices) % 2 != 0:
if split > 1.0:
adj = 'split' if f_down[true_indices[-1]] else '1.0/split'
else:
adj = '1.0/split' if f_down[true_indices[-1]] else 'split'
ranges.append((true_indices[-1], len(f), adj))
return ranges
any_m_lt_1 = False
if idx_latest_active is not None:
idx_rev_latest_active = df.shape[0] - 1 - idx_latest_active
logger.debug(f'idx_latest_active={idx_latest_active}, idx_rev_latest_active={idx_rev_latest_active}', extra=log_extras)
if correct_columns_individually:
f_corrected = np.full(n, False)
if correct_volume:
# If Open or Close is repaired but not both,
# then this means the interval has a mix of correct
# and errors. A problem for correcting Volume,
# so use a heuristic:
# - if both Open & Close were Nx bad => Volume is Nx bad
# - if only one of Open & Close are Nx bad => Volume is 0.5*Nx bad
f_open_fixed = np.full(n, False)
f_close_fixed = np.full(n, False)
OHLC_correct_ranges = [None, None, None, None]
for j in range(len(OHLC)):
c = OHLC[j]
idx_first_f = np.where(f)[0][0]
if appears_suspended and (idx_latest_active is not None and idx_latest_active >= idx_first_f):
# Suspended midway during data date range.
# 1: process data before suspension in index-ascending (date-descending) order.
# 2: process data after suspension in index-descending order. Requires signals to be reversed,
# then returned ranges to also be reversed, because this logic was originally written for
# index-ascending (date-descending) order.
fj = f[:, j]
f_upj = f_up[:, j]
f_downj = f_down[:, j]
ranges_before = map_signals_to_ranges(fj[idx_latest_active:], f_upj[idx_latest_active:], f_downj[idx_latest_active:])
if len(ranges_before) > 0:
# Shift each range back to global indexing
for i in range(len(ranges_before)):
r = ranges_before[i]
ranges_before[i] = (r[0] + idx_latest_active, r[1] + idx_latest_active, r[2])
f_rev_downj = np.flip(np.roll(f_upj, -1)) # correct
f_rev_upj = np.flip(np.roll(f_downj, -1)) # correct
f_revj = f_rev_upj | f_rev_downj
ranges_after = map_signals_to_ranges(f_revj[idx_rev_latest_active:], f_rev_upj[idx_rev_latest_active:], f_rev_downj[idx_rev_latest_active:])
if len(ranges_after) > 0:
# Shift each range back to global indexing:
for i in range(len(ranges_after)):
r = ranges_after[i]
ranges_after[i] = (r[0] + idx_rev_latest_active, r[1] + idx_rev_latest_active, r[2])
# Flip range to normal ordering
for i in range(len(ranges_after)):
r = ranges_after[i]
ranges_after[i] = (n-r[1], n-r[0], r[2])
ranges = ranges_before
ranges.extend(ranges_after)
else:
ranges = map_signals_to_ranges(f[:, j], f_up[:, j], f_down[:, j])
logger.debug(f"column '{c}' ranges: {ranges}", extra=log_extras)
if start_min is not None:
# Prune ranges that are older than start_min
for i in range(len(ranges)-1, -1, -1):
r = ranges[i]
if df2.index[r[0]].date() < start_min:
logger.debug(f'Pruning {c} range {df2.index[r[0]]}->{df2.index[r[1]-1]} because too old.', extra=log_extras)
del ranges[i]
if len(ranges) > 0:
OHLC_correct_ranges[j] = ranges
count = sum([1 if x is not None else 0 for x in OHLC_correct_ranges])
if count == 0:
pass
elif count == 1:
# If only 1 column then assume false positive
idxs = [i if OHLC_correct_ranges[i] else -1 for i in range(len(OHLC))]
idx = np.where(np.array(idxs) != -1)[0][0]
col = OHLC[idx]
logger.debug(f'Potential {fix_type} detected only in column {col}, so treating as false positive (ignore)', extra=log_extras)
else:
# Only correct if at least 2 columns require correction.
n_corrected = [0,0,0,0]
for j in range(len(OHLC)):
c = OHLC[j]
ranges = OHLC_correct_ranges[j]
if ranges is None:
ranges = []
for r in ranges:
if r[2] == 'split':
m = split
m_rcp = split_rcp
else:
m = split_rcp
m_rcp = split
any_m_lt_1 = any_m_lt_1 or m < 0.99
if interday:
msg = f"Corrected {fix_type} on col={c} range=[{df2.index[r[1]-1].date()}:{df2.index[r[0]].date()}] m={m:.4f}"
else:
msg = f"Corrected {fix_type} on col={c} range=[{df2.index[r[1]-1]}:{df2.index[r[0]]}] m={m:.4f}"
logger.debug(msg, extra=log_extras)
# Instead of logging here, just count
n_corrected[j] += r[1]-r[0]
df2.iloc[r[0]:r[1], df2.columns.get_loc(c)] *= m
if c == 'Close':
df2.iloc[r[0]:r[1], df2.columns.get_loc('Adj Close')] *= m
if correct_volume:
if c == 'Open':
f_open_fixed[r[0]:r[1]] = True
elif c == 'Close':
f_close_fixed[r[0]:r[1]] = True
f_corrected[r[0]:r[1]] = True
if sum(n_corrected) > 0:
counts_pretty = ''
for j in range(len(OHLC)):
if n_corrected[j] != 0:
if counts_pretty != '':
counts_pretty += ', '
counts_pretty += f'{OHLC[j]}={n_corrected[j]}x'
msg = f"Corrected: {counts_pretty}"
logger.info(msg, extra=log_extras)
if correct_volume:
f_open_and_closed_fixed = f_open_fixed & f_close_fixed
f_open_xor_closed_fixed = np.logical_xor(f_open_fixed, f_close_fixed)
if f_open_and_closed_fixed.any():
df2.loc[f_open_and_closed_fixed, "Volume"] = (df2.loc[f_open_and_closed_fixed, "Volume"] * m_rcp).round().astype('int')
if f_open_xor_closed_fixed.any():
df2.loc[f_open_xor_closed_fixed, "Volume"] = (df2.loc[f_open_xor_closed_fixed, "Volume"] * 0.5 * m_rcp).round().astype('int')
sudden_change_repaired[f_corrected] = True
else:
n_corrected = 0
idx_first_f = np.where(f)[0][0]
if appears_suspended and (idx_latest_active is not None and idx_latest_active >= idx_first_f):
# Suspended midway during data date range.
# 1: process data before suspension in index-ascending (date-descending) order.
# 2: process data after suspension in index-descending order. Requires signals to be reversed,
# then returned ranges to also be reversed, because this logic was originally written for
# index-ascending (date-descending) order.
ranges_before = map_signals_to_ranges(f[idx_latest_active:], f_up[idx_latest_active:], f_down[idx_latest_active:])
if len(ranges_before) > 0:
# Shift each range back to global indexing
for i in range(len(ranges_before)):
r = ranges_before[i]
ranges_before[i] = (r[0] + idx_latest_active, r[1] + idx_latest_active, r[2])
f_rev_down = np.flip(np.roll(f_up, -1))
f_rev_up = np.flip(np.roll(f_down, -1))
f_rev = f_rev_up | f_rev_down
ranges_after = map_signals_to_ranges(f_rev[idx_rev_latest_active:], f_rev_up[idx_rev_latest_active:], f_rev_down[idx_rev_latest_active:])
if len(ranges_after) > 0:
# Shift each range back to global indexing:
for i in range(len(ranges_after)):
r = ranges_after[i]
ranges_after[i] = (r[0] + idx_rev_latest_active, r[1] + idx_rev_latest_active, r[2])
# Flip range to normal ordering
for i in range(len(ranges_after)):
r = ranges_after[i]
ranges_after[i] = (n-r[1], n-r[0], r[2])
ranges = ranges_before
ranges.extend(ranges_after)
else:
ranges = map_signals_to_ranges(f, f_up, f_down)
if start_min is not None:
# Prune ranges that are older than start_min
for i in range(len(ranges)-1, -1, -1):
r = ranges[i]
if df2.index[r[0]].date() < start_min:
logger.debug(f'Pruning range {df2.index[r[0]]}->{df2.index[r[1]-1]} because too old.', extra=log_extras)
del ranges[i]
for r in ranges:
if r[2] == 'split':
m = split
m_rcp = split_rcp
else:
m = split_rcp
m_rcp = split
any_m_lt_1 = any_m_lt_1 or m < 0.99
logger.debug(f"range={r} m={m}", extra=log_extras)
for c in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
df2.iloc[r[0]:r[1], df2.columns.get_loc(c)] *= m
if correct_dividend:
df2.iloc[r[0]:r[1], df2.columns.get_loc('Dividends')] *= m
if correct_volume:
col_loc = df2.columns.get_loc("Volume")
df2.iloc[r[0]:r[1], col_loc] = (df2.iloc[r[0]:r[1], col_loc] * m_rcp).round().astype('int')
sudden_change_repaired[r[0]:r[1]] = True
if r[0] == r[1] - 1:
if interday:
msg = f"Corrected {fix_type} on interval {df2.index[r[0]].date()}"
else:
msg = f"Corrected {fix_type} on interval {df2.index[r[0]]}"
else:
# Note: df2 sorted with index descending
start = df2.index[r[1] - 1]
end = df2.index[r[0]]
if interday:
msg = f"Corrected {fix_type} across intervals {start.date()} -> {end.date()} (inclusive)"
else:
msg = f"Corrected {fix_type} across intervals {start} -> {end} (inclusive)"
logger.debug(msg, extra=log_extras)
n_corrected += r[1] - r[0]
if len(ranges) <= 2:
msg = "Corrected:"
for r in ranges:
msg += f" {df2.index[r[1]-1].date()} -> {df2.index[r[0]].date()}"
else:
msg = f"Corrected: {n_corrected}x"
logger.info(msg, extra=log_extras)
if unit_switch and any_m_lt_1:
# m < 1 means thats the switch was repaired in favour of the major currency
# e.g. USD beat cents
# But check if _standardise_currency() already did that.
if 'currencyRepaired' in self._history_metadata and self._history_metadata['currencyRepaired']:
# Yes it did, which means this repair did it again.
# Revert the second.
m = change
m_rcp = 1.0/change
for c in ['Open', 'High', 'Low', 'Close', 'Adj Close']:
df2[c] *= m
if correct_dividend:
df2['Dividends'] *= m
if correct_volume:
df2['Volume'] = (df2['Volume'] * m_rcp).round().astype('int')
sudden_change_repaired = ~sudden_change_repaired
if 'Repaired?' not in df2.columns:
df2['Repaired?'] = False
df2['Repaired?'] = df2['Repaired?'].to_numpy() | sudden_change_repaired
if correct_volume:
f_na = df2['Volume'].isna()
if f_na.any():
df2.loc[~f_na,'Volume'] = df2['Volume'][~f_na].round(0).astype('int')
else:
df2['Volume'] = df2['Volume'].round(0).astype('int')
return df2.sort_index()