diff --git a/pure_ocean_breeze/__init__.py b/pure_ocean_breeze/__init__.py index c3ee7e4..f015882 100644 --- a/pure_ocean_breeze/__init__.py +++ b/pure_ocean_breeze/__init__.py @@ -2,8 +2,8 @@ 一个量化多因子研究的框架,包含数据、回测、因子加工等方面的功能 """ -__updated__ = "2022-09-15 23:53:25" -__version__ = "3.2.4" +__updated__ = "2022-09-16 10:32:33" +__version__ = "3.2.5" __author__ = "chenzongwei" __author_email__ = "winterwinter999@163.com" __url__ = "https://github.com/chen-001/pure_ocean_breeze" diff --git a/pure_ocean_breeze/data/read_data.py b/pure_ocean_breeze/data/read_data.py index 7d4dcd4..d8ea65e 100644 --- a/pure_ocean_breeze/data/read_data.py +++ b/pure_ocean_breeze/data/read_data.py @@ -1,4 +1,4 @@ -__updated__ = "2022-09-13 18:05:53" +__updated__ = "2022-09-16 10:32:20" import os import numpy as np @@ -81,86 +81,70 @@ def read_daily( if path: return pd.read_feather(homeplace.daily_data_file + path).set_index("date") elif open: - opens = pd.read_feather( - homeplace.daily_data_file + "opens.feather" - ).set_index("date") + opens = pd.read_feather(homeplace.daily_data_file + "opens.feather") df = opens + df = df.set_index(list(df.columns)[0]) elif close: - closes = pd.read_feather( - homeplace.daily_data_file + "closes.feather" - ).set_index("date") + closes = pd.read_feather(homeplace.daily_data_file + "closes.feather") df = closes + df = df.set_index(list(df.columns)[0]) elif high: - highs = pd.read_feather( - homeplace.daily_data_file + "highs.feather" - ).set_index("date") + highs = pd.read_feather(homeplace.daily_data_file + "highs.feather") df = highs + df = df.set_index(list(df.columns)[0]) elif low: - lows = pd.read_feather( - homeplace.daily_data_file + "lows.feather" - ).set_index("date") + lows = pd.read_feather(homeplace.daily_data_file + "lows.feather") df = lows + df = df.set_index(list(df.columns)[0]) elif tr: - trs = pd.read_feather(homeplace.daily_data_file + "trs.feather").set_index( - "date" - ) + trs = pd.read_feather(homeplace.daily_data_file + "trs.feather") df = trs + df = df.set_index(list(df.columns)[0]) elif sharenum: - sharenums = pd.read_feather( - homeplace.daily_data_file + "sharenums.feather" - ).set_index("date") + sharenums = pd.read_feather(homeplace.daily_data_file + "sharenums.feather") df = sharenums + df = df.set_index(list(df.columns)[0]) elif volume: - volumes = pd.read_feather( - homeplace.daily_data_file + "volumes.feather" - ).set_index("date") + volumes = pd.read_feather(homeplace.daily_data_file + "volumes.feather") df = volumes + df = df.set_index(list(df.columns)[0]) elif age: - age = pd.read_feather(homeplace.daily_data_file + "ages.feather").set_index( - "date" - ) + age = pd.read_feather(homeplace.daily_data_file + "ages.feather") df = age + df = df.set_index(list(df.columns)[0]) elif flow_cap: - closes = pd.read_feather( - homeplace.daily_data_file + "closes_unadj.feather" - ).set_index("date") - sharenums = pd.read_feather( - homeplace.daily_data_file + "sharenums.feather" - ).set_index("date") + closes = pd.read_feather(homeplace.daily_data_file + "closes_unadj.feather") + sharenums = pd.read_feather(homeplace.daily_data_file + "sharenums.feather") + closes = closes.set_index(list(closes.columns)[0]) + sharenums = sharenums.set_index(list(sharenums.columns)[0]) flow_cap = closes * sharenums df = flow_cap elif st: - st = pd.read_feather(homeplace.daily_data_file + "sts.feather").set_index( - "date" - ) + st = pd.read_feather(homeplace.daily_data_file + "sts.feather") df = st + df = df.set_index(list(df.columns)[0]) elif state: - state = pd.read_feather( - homeplace.daily_data_file + "states.feather" - ).set_index("date") + state = pd.read_feather(homeplace.daily_data_file + "states.feather") df = state + df = df.set_index(list(df.columns)[0]) else: raise IOError("阁下总得读点什么吧?🤒") else: if open: - opens = pd.read_feather( - homeplace.daily_data_file + "opens.feather" - ).set_index("date") + opens = pd.read_feather(homeplace.daily_data_file + "opens.feather") df = opens + df = df.set_index(list(df.columns)[0]) elif close: - closes = pd.read_feather( - homeplace.daily_data_file + "closes.feather" - ).set_index("date") + closes = pd.read_feather(homeplace.daily_data_file + "closes.feather") df = closes + df = df.set_index(list(df.columns)[0]) elif high: - highs = pd.read_feather( - homeplace.daily_data_file + "highs.feather" - ).set_index("date") + highs = pd.read_feather(homeplace.daily_data_file + "highs.feather") df = highs + df = df.set_index(list(df.columns)[0]) elif low: - lows = pd.read_feather( - homeplace.daily_data_file + "lows.feather" - ).set_index("date") + lows = pd.read_feather(homeplace.daily_data_file + "lows.feather") + df = df.set_index(list(df.columns)[0]) df = lows else: raise IOError("阁下总得读点什么吧?🤒") diff --git a/pure_ocean_breeze/labor/process.py b/pure_ocean_breeze/labor/process.py index 6ec0328..73354c3 100644 --- a/pure_ocean_breeze/labor/process.py +++ b/pure_ocean_breeze/labor/process.py @@ -1,4 +1,4 @@ -__updated__ = "2022-09-15 22:05:26" +__updated__ = "2022-09-16 10:38:19" import numpy as np import pandas as pd @@ -1327,8 +1327,6 @@ class pure_moon(object): def __init__( cls, startdate: int, - zxindustry_dummies=0, - swindustry_dummies=0, ): cls.homeplace = HomePlace() # 已经算好的月度st状态文件 @@ -1336,30 +1334,37 @@ def __init__( # 已经算好的月度交易状态文件 cls.states_monthly_file = homeplace.daily_data_file + "states_monthly.feather" - if swindustry_dummies: - cls.industry_dummy = ( - pd.read_feather(cls.homeplace.daily_data_file + "申万行业2021版哑变量.feather") - .set_index("date") - .groupby("code") - .resample("M") - .last() - ) - else: - cls.industry_dummy = ( - pd.read_feather(cls.homeplace.daily_data_file + "中信一级行业哑变量代码版.feather") - .fillna(0) - .set_index("date") - .groupby("code") - .resample("M") - .last() - ) - cls.industry_dummy = cls.industry_dummy.drop(columns=["code"]).reset_index() - cls.industry_ws = [f"w{i}" for i in range(1, cls.industry_dummy.shape[1] - 1)] - col = ["code", "date"] + cls.industry_ws - cls.industry_dummy.columns = col - cls.industry_dummy = cls.industry_dummy[ - cls.industry_dummy.date >= pd.Timestamp(str(startdate)) - ] + cls.swindustry_dummy = ( + pd.read_feather(cls.homeplace.daily_data_file + "申万行业2021版哑变量.feather") + .fillna(0) + .set_index("date") + .groupby("code") + .resample("M") + .last() + ) + + cls.zxindustry_dummy = ( + pd.read_feather(cls.homeplace.daily_data_file + "中信一级行业哑变量代码版.feather") + .fillna(0) + .set_index("date") + .groupby("code") + .resample("M") + .last() + .fillna(0) + ) + + def deal_dummy(industry_dummy): + industry_dummy = industry_dummy.drop(columns=["code"]).reset_index() + industry_ws = [f"w{i}" for i in range(1, industry_dummy.shape[1] - 1)] + col = ["code", "date"] + industry_ws + industry_dummy.columns = col + industry_dummy = industry_dummy[ + industry_dummy.date >= pd.Timestamp(str(startdate)) + ] + return industry_dummy + + cls.swindustry_dummy = deal_dummy(cls.swindustry_dummy) + cls.zxindustry_dummy = deal_dummy(cls.zxindustry_dummy) def __call__(self): """调用对象则返回因子值""" @@ -1513,7 +1518,11 @@ def single(x): else: cls.cap["cap_size"] = np.log(cls.cap["cap_size"]) - def get_neutral_factors(self): + def get_neutral_factors( + self, + zxindustry_dummies=0, + swindustry_dummies=0, + ): """对因子进行市值中性化""" self.factors = self.factors.set_index("date") self.factors.index = self.factors.index + pd.DateOffset(months=1) @@ -1530,7 +1539,14 @@ def get_neutral_factors(self): self.factors = pd.merge( self.factors, self.cap, how="inner", on=["date", "code"] ) - self.factors = pd.merge(self.factors, self.industry_dummy, on=["date", "code"]) + if swindustry_dummies: + self.factors = pd.merge( + self.factors, self.swindustry_dummy, on=["date", "code"] + ) + else: + self.factors = pd.merge( + self.factors, self.zxindustry_dummy, on=["date", "code"] + ) self.factors = self.factors.set_index(["date", "code"]) self.factors = self.factors.groupby(["date"]).apply(self.neutralize_factors) self.factors = self.factors.reset_index() @@ -1847,6 +1863,8 @@ def run( rets_sheetname=None, on_paper=False, sheetname=None, + zxindustry_dummies=0, + swindustry_dummies=0, ): """运行回测部分""" if comments_writer and not (comments_sheetname or sheetname): @@ -1857,11 +1875,17 @@ def run( raise IOError("把group_rets输出到excel中时,必须指定sheetname🤒") if neutralize: self.get_log_cap() - self.get_neutral_factors() + self.get_neutral_factors( + swindustry_dummies=swindustry_dummies, + zxindustry_dummies=zxindustry_dummies, + ) self.deal_with_factors_after_neutralize() elif boxcox: self.get_log_cap(boxcox=True) - self.get_neutral_factors() + self.get_neutral_factors( + swindustry_dummies=swindustry_dummies, + zxindustry_dummies=zxindustry_dummies, + ) self.deal_with_factors_after_neutralize() else: self.deal_with_factors() @@ -2087,8 +2111,6 @@ def __init__( capitals = read_daily(flow_cap=1, start=start).resample("M").last() self.shen = pure_moon( startdate=start, - zxindustry_dummies=zxindustry_dummies, - swindustry_dummies=swindustry_dummies, ) self.shen.set_basic_data( age=ages, @@ -2120,6 +2142,8 @@ def __init__( rets_sheetname=rets_sheetname, on_paper=on_paper, sheetname=sheetname, + swindustry_dummies=swindustry_dummies, + zxindustry_dummies=zxindustry_dummies, ) def __call__(self) -> pd.DataFrame: diff --git "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" index fa8bf6b..6d7ae6c 100644 --- "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" +++ "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" @@ -1,5 +1,9 @@ ## 更新日志🗓 — v3 +* v3.2.5 — 2022.09.16 +> 1. 修复了读取日频数据函数read_daily由于索引名称更改导致的bug +> 1. 修复了缓存机制导致同一内核中,无法转换中信行业和申万行业的bug +> 1. 给用clickhouse的分钟数据计算因子值的类pure_fall_frequent增加了notebook进度条功能,当tqdm_inside指定为-1时,即使用tqdm.tqdm_notebook功能 * v3.2.4 — 2022.09.15 > 1. 改善了以clickhouse和questdb分钟数据计算因子的循环逻辑,将需要计算的时间拆分为多段相邻时间来计算,并补充了起始第一天的计算 > 1. 将保存最终因子值的函数database_save_final_factors增加了去除全空行的功能