From 6ea3846c5ca501be4aa4c3821deb057440b96ee0 Mon Sep 17 00:00:00 2001
From: chenzongwei <63836858+chen-001@users.noreply.github.com>
Date: Wed, 26 Oct 2022 19:59:47 +0800
Subject: [PATCH] v3.4.1

---
 pure_ocean_breeze/__init__.py      |   4 +-
 pure_ocean_breeze/data/tools.py    | 107 +++++++++++++++---
 pure_ocean_breeze/labor/process.py |  71 ++++++++----
 .../version3.md"                   |   5 +
 4 files changed, 145 insertions(+), 42 deletions(-)

diff --git a/pure_ocean_breeze/__init__.py b/pure_ocean_breeze/__init__.py
index 4bff2bb..b47ca78 100644
--- a/pure_ocean_breeze/__init__.py
+++ b/pure_ocean_breeze/__init__.py
@@ -2,8 +2,8 @@
 A framework for quantitative multi-factor research, covering data, backtesting, factor construction and more
 """
 
-__updated__ = "2022-10-25 07:45:37"
-__version__ = "3.4.0"
+__updated__ = "2022-10-26 10:56:42"
+__version__ = "3.4.1"
 __author__ = "chenzongwei"
 __author_email__ = "winterwinter999@163.com"
 __url__ = "https://github.com/chen-001/pure_ocean_breeze"
diff --git a/pure_ocean_breeze/data/tools.py b/pure_ocean_breeze/data/tools.py
index f21713a..846f7f9 100644
--- a/pure_ocean_breeze/data/tools.py
+++ b/pure_ocean_breeze/data/tools.py
@@ -2,8 +2,9 @@
 Utility functions for reading data files in some uncommon formats, plus other data tools
 """
 
-__updated__ = "2022-10-24 20:33:41"
+__updated__ = "2022-10-26 18:51:43"
 
+import os
 import h5py
 import pandas as pd
 import tqdm
@@ -13,6 +14,7 @@
 import numpy_ext as npext
 import scipy.stats as ss
 from functools import reduce, partial
+from loguru import logger
 from typing import Callable, Union
 
 try:
@@ -21,6 +23,7 @@
     rqdatac.init()
 except Exception:
     print("RiceQuant is not connected for now")
+from pure_ocean_breeze.state.homeplace import HomePlace
 
 
 def read_h5(path: str) -> dict:
@@ -377,7 +380,11 @@ def merge_many(dfs: list[pd.DataFrame], names: list = None) -> pd.DataFrame:
 
 
 def corr_two_daily(
-    df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    history: str = None,
+    rolling_window: int = 20,
+    n_jobs: int = 6,
 ) -> pd.DataFrame:
     """Compute, stock by stock, the rolling-window time-series correlation between two factors
 
@@ -387,6 +394,8 @@
         The first factor, with dates as the index and stock codes as the columns
     df2 : pd.DataFrame
         The second factor, with dates as the index and stock codes as the columns
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -402,12 +411,21 @@
     def corr_in(a, b, c):
         return c.iloc[-1], np.corrcoef(a, b)[0, 1]
 
     return func_two_daily(
-        df1=df1, df2=df2, func=corr_in, rolling_window=rolling_window, n_jobs=n_jobs
+        df1=df1,
+        df2=df2,
+        func=corr_in,
+        history=history,
+        rolling_window=rolling_window,
+        n_jobs=n_jobs,
     )
 
 
 def cov_two_daily(
-    df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    history: str = None,
+    rolling_window: int = 20,
+    n_jobs: int = 6,
 ) -> pd.DataFrame:
     """Compute, stock by stock, the rolling-window time-series covariance between two factors
@@ -417,6 +435,8 @@
         The first factor, with dates as the index and stock codes as the columns
     df2 : pd.DataFrame
         The second factor, with dates as the index and stock codes as the columns
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -432,7 +452,12 @@
     def cov_in(a, b, c):
         return c.iloc[-1], np.cov(a, b)[0, 1]
 
     return func_two_daily(
-        df1=df1, df2=df2, func=cov_in, rolling_window=rolling_window, n_jobs=n_jobs
+        df1=df1,
+        df2=df2,
+        func=cov_in,
+        history=history,
+        rolling_window=rolling_window,
+        n_jobs=n_jobs,
     )
 
 
@@ -440,6 +465,7 @@ def func_two_daily(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
     func: Callable,
+    history: str = None,
     rolling_window: int = 20,
     n_jobs: int = 6,
 ) -> pd.DataFrame:
@@ -453,6 +479,8 @@
         The second factor, with dates as the index and stock codes as the columns
     func : Callable
         The function applied to the two series of values
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -474,17 +502,64 @@ def func_rolling(df):
         )
         return df
 
-    twins = merge_many([df1, df2])
-    tqdm.tqdm.pandas()
-    corrs = twins.groupby(["code"]).progress_apply(func_rolling)
-    cor = []
-    for i in range(len(corrs)):
-        df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
-        cor.append(df)
-    cors = pd.concat(cor)
-    cors.columns = ["date", "corr", "code"]
-    cors = cors.pivot(index="date", columns="code", values="corr")
-    return cors
+    homeplace = HomePlace()
+    if history is not None:
+        if os.path.exists(homeplace.update_data_file + history):
+            old = pd.read_feather(homeplace.update_data_file + history)
+            old = old.set_index(list(old.columns)[0])
+            new_end = min(df1.index.max(), df2.index.max())
+            if new_end > old.index.max():
+                old_end = datetime.datetime.strftime(old.index.max(), "%Y%m%d")
+                logger.info(f"Last updated through {old_end}")
+                df1a = df1[df1.index <= old.index.max()].tail(rolling_window - 1)
+                df1b = df1[df1.index > old.index.max()]
+                df1 = pd.concat([df1a, df1b])
+                df2a = df2[df2.index <= old.index.max()].tail(rolling_window - 1)
+                df2b = df2[df2.index > old.index.max()]
+                df2 = pd.concat([df2a, df2b])
+                twins = merge_many([df1, df2])
+                tqdm.tqdm.pandas()
+                corrs = twins.groupby(["code"]).progress_apply(func_rolling)
+                cor = []
+                for i in range(len(corrs)):
+                    df = (
+                        pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
+                    )
+                    cor.append(df)
+                cors = pd.concat(cor)
+                cors.columns = ["date", "corr", "code"]
+                cors = cors.pivot(index="date", columns="code", values="corr")
+                if history is not None:
+                    if os.path.exists(homeplace.update_data_file + history):
+                        cors = pd.concat([old, cors])
+                        cors = drop_duplicates_index(cors)
+                    cors.reset_index().to_feather(homeplace.update_data_file + history)
+                    new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
+                    logger.info(f"Updated through {new_end}")
+                return cors
+            else:
+                logger.info("Already up to date")
+                return old
+        else:
+            logger.info("Computing for the first time; please wait, the result will be saved once finished")
+    twins = merge_many([df1, df2])
+    tqdm.tqdm.pandas()
+    corrs = twins.groupby(["code"]).progress_apply(func_rolling)
+    cor = []
+    for i in range(len(corrs)):
+        df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
+        cor.append(df)
+    cors = pd.concat(cor)
+    cors.columns = ["date", "corr", "code"]
+    cors = cors.pivot(index="date", columns="code", values="corr")
+    if history is not None:
+        if os.path.exists(homeplace.update_data_file + history):
+            cors = pd.concat([old, cors])
+            cors = drop_duplicates_index(cors)
+        cors.reset_index().to_feather(homeplace.update_data_file + history)
+        new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
+        logger.info(f"Updated through {new_end}")
+    return cors
 
 
 def drop_duplicates_index(new: pd.DataFrame) -> pd.DataFrame:
diff --git a/pure_ocean_breeze/labor/process.py b/pure_ocean_breeze/labor/process.py
index 0f01e27..93484c1 100644
--- a/pure_ocean_breeze/labor/process.py
+++ b/pure_ocean_breeze/labor/process.py
@@ -1,4 +1,4 @@
-__updated__ = "2022-10-11 12:35:40"
+__updated__ = "2022-10-26 18:52:42"
 
 import warnings
 
@@ -999,6 +999,7 @@ def show_corrs(
     factor_names: list[str] = None,
     print_bool: bool = True,
     show_percent: bool = True,
+    method: str='spearman',
 ) -> pd.DataFrame:
     """Show the pairwise cross-sectional correlations among a set of factors
 
@@ -1012,6 +1013,8 @@ def show_corrs(
         Whether to print the table of pairwise correlation coefficients, by default True
     show_percent : bool, optional
         Whether to display the values as percentages, by default True
+    method : str, optional
+        Method used to compute the correlation coefficients, by default "spearman"
 
     Returns
     -------
@@ -1022,7 +1025,7 @@ def show_corrs(
     for i in range(len(factors)):
         main_i = factors[i]
         follows = factors[i + 1 :]
-        corr = [show_corr(main_i, i, plt_plot=False) for i in follows]
+        corr = [show_corr(main_i, i, plt_plot=False,method=method) for i in follows]
         corr = [np.nan] * (i + 1) + corr
         corrs.append(corr)
     if factor_names is None:
@@ -1062,12 +1065,27 @@ def de_cross(
     return (y - xs)()
 
 
-def show_corrs_with_old(df):
-    df0=df.resample('M').last()
-    if df.shape[0]/df0.shape[0]>2:
-        daily=1
-    else:
-        daily=0
+def show_corrs_with_old(df:pd.DataFrame=None,method:str='spearman')->pd.DataFrame:
+    """Compute the correlations between a new factor and the existing (old) factors
+
+    Parameters
+    ----------
+    df : pd.DataFrame, optional
+        The new factor, by default None
+    method : str, optional
+        Method used to compute the correlations, by default 'spearman'
+
+    Returns
+    -------
+    pd.DataFrame
+        The correlation matrix
+    """
+    if df is not None:
+        df0=df.resample('M').last()
+        if df.shape[0]/df0.shape[0]>2:
+            daily=1
+        else:
+            daily=0
     olds=[]
     for i in range(1,100):
         try:
@@ -1078,8 +1096,11 @@ def show_corrs_with_old(df):
             olds.append(old)
         except Exception:
             break
-    olds=[df]+olds
-    corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))])
+    if df is not None:
+        olds=[df]+olds
+        corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))],method=method)
+    else:
+        corrs=show_corrs(olds,[f'old{i}' for i in range(1,len(olds))],method=method)
     return corrs
 
 
@@ -4214,7 +4235,7 @@ def ols_in(self, df):
     ...
 
 
-def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,zz2000:bool=0)->pd.Series:
+def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,iplot:bool=1)->pd.Series:
     """Run long-short and long-only tests of a factor within an index's constituent stocks
 
     Parameters
@@ -4227,33 +4248,35 @@
     ----------
     df : pd.DataFrame
         Factor values, with dates as the index and stock codes as the columns
     hs300 : bool, optional
         Test within the CSI 300 constituents, by default 0
     zz500 : bool, optional
         Test within the CSI 500 constituents, by default 0
     zz1000 : bool, optional
         Test within the CSI 1000 constituents, by default 0
-    zz2000 : bool, optional
-        Test within the CNI 2000 constituents, by default 0
+    iplot : bool, optional
+        Whether to plot with cufflinks during the long-short backtest
 
     Returns
     -------
    pd.Series
         Excess-return series of the long group relative to the index
     """
-    fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
-    shen=pure_moonnight(fi300)
+    fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000)
+    shen=pure_moonnight(fi300,iplot=iplot)
     if shen.shen.group_net_values.group1.iloc[-1]>shen.shen.group_net_values.group10.iloc[-1]:
-        print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
-        abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
+        print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000))
+        abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000)
         return abrets
     else:
-        print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
-        abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
+        print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000))
+        abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000)
         return abrets
 
 
-def test_on_index_four(df:pd.DataFrame,gz2000:bool=0)->pd.DataFrame:
+def test_on_index_four(df:pd.DataFrame,iplot:bool=1,gz2000:bool=0)->pd.DataFrame:
     """Run long-short and long-only excess-return tests of a factor on the constituents of four indexes at once: CSI 300, CSI 500, CSI 1000 and CNI 2000
 
     Parameters
     ----------
     df : pd.DataFrame
         Factor values, with dates as the index and stock codes as the columns
+    iplot : bool, optional
+        Whether to plot with cufflinks during the long-short backtest
     gz2000 : bool, optional
         Whether to also run the test on the CNI 2000, by default 0
 
@@ -4262,11 +4285,11 @@
     pd.DataFrame
         Excess-return series of the long group relative to each index
     """
-    abrets300=test_on_300500(df,hs300=1).to_frame('沪深300')
-    abrets500=test_on_300500(df,zz500=1).to_frame('中证500')
-    abrets1000=test_on_300500(df,zz1000=1).to_frame('中证1000')
+    abrets300=test_on_300500(df,hs300=1,iplot=iplot).to_frame('沪深300')
+    abrets500=test_on_300500(df,zz500=1,iplot=iplot).to_frame('中证500')
+    abrets1000=test_on_300500(df,zz1000=1,iplot=iplot).to_frame('中证1000')
     if gz2000:
-        abrets2000=test_on_300500(df,gz2000=1).to_frame('国证2000')
+        abrets2000=test_on_300500(df,gz2000=1,iplot=iplot).to_frame('国证2000')
         abrs=pd.concat([abrets300,abrets500,abrets1000,abrets2000],axis=1)
     else:
         abrs=pd.concat([abrets300,abrets500,abrets1000],axis=1)
diff --git "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
index 5cd2057..3a06ac2 100644
--- "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
+++ "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
@@ -1,5 +1,10 @@
 ## Changelog 🗓 — v3
 
+* v3.4.1 — 2022.10.26
+> 1. Added a history parameter to func_two_daily, corr_two_daily and cov_two_daily, used to save the computed results to a local file
+> 1. Added a method parameter to show_corrs and show_corrs_with_old, so the way the correlation coefficients are computed can be changed
+> 1. Temporarily removed the CNI 2000 (gz2000) parameter from test_on_300500
+> 1. Added an iplot parameter to test_on_300500 and test_on_index_four that decides whether cufflinks is used for plotting
 * v3.4.0 — 2022.10.25
 > 1. Fixed a bug in merge_many, the function that concatenates multiple dataframes
 > 1. Fixed a bug that occurred when importing the process module
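The following is not part of the patch above. It is a minimal usage sketch of the new history parameter on corr_two_daily, assuming the package's local paths have already been configured (HomePlace) and using small randomly generated factor panels; the file name "fac1_fac2_corr.feather" is hypothetical, and per the patch the file is resolved by concatenating it onto HomePlace().update_data_file.

```python
import numpy as np
import pandas as pd
from pure_ocean_breeze.data.tools import corr_two_daily

# Hypothetical daily factor panels: index = trade dates, columns = stock codes.
dates = pd.date_range("2021-01-04", "2022-10-26", freq="B")
codes = ["000001.SZ", "600000.SH", "600519.SH"]
fac1 = pd.DataFrame(np.random.randn(len(dates), len(codes)), index=dates, columns=codes)
fac2 = pd.DataFrame(np.random.randn(len(dates), len(codes)), index=dates, columns=codes)

# First run: the full 20-day rolling correlation is computed and, because
# `history` is given, saved as a feather file under HomePlace().update_data_file.
corr = corr_two_daily(fac1, fac2, history="fac1_fac2_corr.feather", rolling_window=20)

# Later runs with the same `history` file reuse the stored result: if the panels
# now extend further in time, only the new dates are computed and appended;
# otherwise the function logs that it is already up to date and returns the old result.
corr = corr_two_daily(fac1, fac2, history="fac1_fac2_corr.feather", rolling_window=20)
```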
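Also not part of the patch: a sketch of the other two interface changes in v3.4.1, the method argument and the iplot switch, reusing the hypothetical fac1 panel from the previous example. Whether "pearson" is accepted depends on what show_corr forwards to pandas, and show_corrs_with_old assumes the numbered old-factor files it looks for exist locally. Note that test_on_index_four's gz2000 branch still calls test_on_300500 with a gz2000 argument that this patch removes, so gz2000 is left at its default of 0 here.

```python
from pure_ocean_breeze.labor.process import show_corrs_with_old, test_on_index_four

# Correlations of a new factor with the locally stored old factors.
# method defaults to "spearman"; a Pearson correlation can be requested instead.
corrs = show_corrs_with_old(fac1, method="pearson")

# Long-only excess returns on the CSI 300 / CSI 500 / CSI 1000 constituents.
# iplot=1 (the default) lets pure_moonnight draw the long-short backtest with
# cufflinks; iplot=0 turns the cufflinks-style plotting off.
abrs = test_on_index_four(fac1, iplot=0)
```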