Skip to content

Commit

Permalink
v3.4.1
Browse files Browse the repository at this point in the history
  • Loading branch information
chen-001 committed Oct 26, 2022
1 parent 4a5bb42 commit 6ea3846
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 42 deletions.
4 changes: 2 additions & 2 deletions pure_ocean_breeze/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
一个量化多因子研究的框架,包含数据、回测、因子加工等方面的功能
"""

__updated__ = "2022-10-25 07:45:37"
__version__ = "3.4.0"
__updated__ = "2022-10-26 10:56:42"
__version__ = "3.4.1"
__author__ = "chenzongwei"
__author_email__ = "[email protected]"
__url__ = "https://github.com/chen-001/pure_ocean_breeze"
Expand Down
107 changes: 91 additions & 16 deletions pure_ocean_breeze/data/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
针对一些不常见的文件格式,读取数据文件的一些工具函数,以及其他数据工具
"""

__updated__ = "2022-10-24 20:33:41"
__updated__ = "2022-10-26 18:51:43"

import os
import h5py
import pandas as pd
import tqdm
Expand All @@ -13,6 +14,7 @@
import numpy_ext as npext
import scipy.stats as ss
from functools import reduce, partial
from loguru import logger
from typing import Callable, Union

try:
Expand All @@ -21,6 +23,7 @@
rqdatac.init()
except Exception:
print("暂时未连接米筐")
from pure_ocean_breeze.state.homeplace import HomePlace


def read_h5(path: str) -> dict:
Expand Down Expand Up @@ -377,7 +380,11 @@ def merge_many(dfs: list[pd.DataFrame], names: list = None) -> pd.DataFrame:


def corr_two_daily(
df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
df1: pd.DataFrame,
df2: pd.DataFrame,
history: str = None,
rolling_window: int = 20,
n_jobs: int = 6,
) -> pd.DataFrame:
"""求两个因子,在相同股票上,时序上滚动窗口下的相关系数
Expand All @@ -387,6 +394,8 @@ def corr_two_daily(
第一个因子,index为时间,columns为股票代码
df2 : pd.DataFrame
第二个因子,index为时间,columns为股票代码
history : str, optional
历史结果文件名(feather格式,拼接在homeplace.update_data_file之后);文件已存在时读取历史结果并只对新增日期做增量计算后回写,不存在时首次完整计算并存入该文件, by default None
rolling_window : int, optional
滚动窗口, by default 20
n_jobs : int, optional
Expand All @@ -402,12 +411,21 @@ def corr_in(a, b, c):
return c.iloc[-1], np.corrcoef(a, b)[0, 1]

return func_two_daily(
df1=df1, df2=df2, func=corr_in, rolling_window=rolling_window, n_jobs=n_jobs
df1=df1,
df2=df2,
func=corr_in,
history=history,
rolling_window=rolling_window,
n_jobs=n_jobs,
)


def cov_two_daily(
df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
df1: pd.DataFrame,
df2: pd.DataFrame,
history: str = None,
rolling_window: int = 20,
n_jobs: int = 6,
) -> pd.DataFrame:
"""求两个因子,在相同股票上,时序上滚动窗口下的协方差
Expand All @@ -417,6 +435,8 @@ def cov_two_daily(
第一个因子,index为时间,columns为股票代码
df2 : pd.DataFrame
第二个因子,index为时间,columns为股票代码
history : str, optional
历史结果文件名(feather格式,拼接在homeplace.update_data_file之后);文件已存在时读取历史结果并只对新增日期做增量计算后回写,不存在时首次完整计算并存入该文件, by default None
rolling_window : int, optional
滚动窗口, by default 20
n_jobs : int, optional
Expand All @@ -432,14 +452,20 @@ def cov_in(a, b, c):
return c.iloc[-1], np.cov(a, b)[0, 1]

return func_two_daily(
df1=df1, df2=df2, func=cov_in, rolling_window=rolling_window, n_jobs=n_jobs
df1=df1,
df2=df2,
func=cov_in,
history=history,
rolling_window=rolling_window,
n_jobs=n_jobs,
)


def func_two_daily(
df1: pd.DataFrame,
df2: pd.DataFrame,
func: Callable,
history: str = None,
rolling_window: int = 20,
n_jobs: int = 6,
) -> pd.DataFrame:
Expand All @@ -453,6 +479,8 @@ def func_two_daily(
第二个因子,index为时间,columns为股票代码
func : Callable
要对两列数进行操作的函数
history : str, optional
历史结果文件名(feather格式,拼接在homeplace.update_data_file之后);文件已存在时读取历史结果并只对新增日期做增量计算后回写,不存在时首次完整计算并存入该文件, by default None
rolling_window : int, optional
滚动窗口, by default 20
n_jobs : int, optional
Expand All @@ -474,17 +502,64 @@ def func_rolling(df):
)
return df

twins = merge_many([df1, df2])
tqdm.tqdm.pandas()
corrs = twins.groupby(["code"]).progress_apply(func_rolling)
cor = []
for i in range(len(corrs)):
df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
cor.append(df)
cors = pd.concat(cor)
cors.columns = ["date", "corr", "code"]
cors = cors.pivot(index="date", columns="code", values="corr")
return cors
homeplace = HomePlace()
if history is not None:
if os.path.exists(homeplace.update_data_file + history):
old = pd.read_feather(homeplace.update_data_file + history)
old = old.set_index(list(old.columns)[0])
new_end = min(df1.index.max(), df2.index.max())
if new_end > old.index.max():
old_end = datetime.datetime.strftime(old.index.max(), "%Y%m%d")
logger.info(f"上次更新到了{old_end}")
df1a = df1[df1.index <= old.index.max()].tail(rolling_window - 1)
df1b = df1[df1.index > old.index.max()]
df1 = pd.concat([df1a, df1b])
df2a = df2[df2.index <= old.index.max()].tail(rolling_window - 1)
df2b = df2[df2.index > old.index.max()]
df2 = pd.concat([df2a, df2b])
twins = merge_many([df1, df2])
tqdm.tqdm.pandas()
corrs = twins.groupby(["code"]).progress_apply(func_rolling)
cor = []
for i in range(len(corrs)):
df = (
pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
)
cor.append(df)
cors = pd.concat(cor)
cors.columns = ["date", "corr", "code"]
cors = cors.pivot(index="date", columns="code", values="corr")
if history is not None:
if os.path.exists(homeplace.update_data_file + history):
cors = pd.concat([old, cors])
cors = drop_duplicates_index(cors)
cors.reset_index().to_feather(homeplace.update_data_file + history)
new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
logger.info(f"已经更新至{new_end}")
return cors
else:
logger.info(f"已经是最新的了")
return old
else:
logger.info("第一次计算,请耐心等待,计算完成后将存储")
twins = merge_many([df1, df2])
tqdm.tqdm.pandas()
corrs = twins.groupby(["code"]).progress_apply(func_rolling)
cor = []
for i in range(len(corrs)):
df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
cor.append(df)
cors = pd.concat(cor)
cors.columns = ["date", "corr", "code"]
cors = cors.pivot(index="date", columns="code", values="corr")
if history is not None:
if os.path.exists(homeplace.update_data_file + history):
cors = pd.concat([old, cors])
cors = drop_duplicates_index(cors)
cors.reset_index().to_feather(homeplace.update_data_file + history)
new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
logger.info(f"已经更新至{new_end}")
return cors


def drop_duplicates_index(new: pd.DataFrame) -> pd.DataFrame:
Expand Down
71 changes: 47 additions & 24 deletions pure_ocean_breeze/labor/process.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__updated__ = "2022-10-11 12:35:40"
__updated__ = "2022-10-26 18:52:42"

import warnings

Expand Down Expand Up @@ -999,6 +999,7 @@ def show_corrs(
factor_names: list[str] = None,
print_bool: bool = True,
show_percent: bool = True,
method: str='spearman',
) -> pd.DataFrame:
"""展示很多因子两两之间的截面相关性
Expand All @@ -1012,6 +1013,8 @@ def show_corrs(
是否打印出两两之间相关系数的表格, by default True
show_percent : bool, optional
是否以百分数的形式展示, by default True
method : str, optional
计算相关系数的方法, by default "spearman"
Returns
-------
Expand All @@ -1022,7 +1025,7 @@ def show_corrs(
for i in range(len(factors)):
main_i = factors[i]
follows = factors[i + 1 :]
corr = [show_corr(main_i, i, plt_plot=False) for i in follows]
corr = [show_corr(main_i, i, plt_plot=False,method=method) for i in follows]
corr = [np.nan] * (i + 1) + corr
corrs.append(corr)
if factor_names is None:
Expand Down Expand Up @@ -1062,12 +1065,27 @@ def de_cross(
return (y - xs)()


def show_corrs_with_old(df):
df0=df.resample('M').last()
if df.shape[0]/df0.shape[0]>2:
daily=1
else:
daily=0
def show_corrs_with_old(df:pd.DataFrame=None,method:str='spearman')->pd.DataFrame:
"""计算新因子和已有因子的相关系数
Parameters
----------
df : pd.DataFrame, optional
新因子,为None时只计算已有因子两两之间的相关系数, by default None
method : str, optional
求相关系数的方法, by default 'spearman'
Returns
-------
pd.DataFrame
相关系数矩阵
"""
if df is not None:
df0=df.resample('M').last()
if df.shape[0]/df0.shape[0]>2:
daily=1
else:
daily=0
olds=[]
for i in range(1,100):
try:
Expand All @@ -1078,8 +1096,11 @@ def show_corrs_with_old(df):
olds.append(old)
except Exception:
break
olds=[df]+olds
corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))])
if df is not None:
olds=[df]+olds
corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))],method=method)
else:
corrs=show_corrs(olds,[f'old{i}' for i in range(1,len(olds))],method=method)
return corrs


Expand Down Expand Up @@ -4214,7 +4235,7 @@ def ols_in(self, df):
...


def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,zz2000:bool=0)->pd.Series:
def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,iplot:bool=1)->pd.Series:
"""对因子在指数成分股内进行多空和多头测试
Parameters
Expand All @@ -4227,33 +4248,35 @@ def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,zz200
在中证500成分股内测试, by default 0
zz1000 : bool, optional
在中证1000成分股内测试, by default 0
zz2000 : bool, optional
在国证2000成分股内测试, by default 0
iplot : bool, optional
多空回测的时候,是否使用cufflinks绘画, by default 1
Returns
-------
pd.Series
多头组在该指数上的超额收益序列
"""
fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
shen=pure_moonnight(fi300)
fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000)
shen=pure_moonnight(fi300,iplot=iplot)
if shen.shen.group_net_values.group1.iloc[-1]>shen.shen.group_net_values.group10.iloc[-1]:
print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000))
abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000)
return abrets
else:
print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000))
abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000)
return abrets


def test_on_index_four(df:pd.DataFrame,gz2000:bool=0)->pd.DataFrame:
def test_on_index_four(df:pd.DataFrame,iplot:bool=1,gz2000:bool=0)->pd.DataFrame:
"""对因子同时在沪深300、中证500、中证1000、国证2000这4个指数成分股内进行多空和多头超额测试
Parameters
----------
df : pd.DataFrame
因子值,index为时间,columns为股票代码
iplot : bool, optional
多空回测的时候,是否使用cufflinks绘画, by default 1
gz2000 : bool, optional
是否进行国证2000上的测试, by default 0
Expand All @@ -4262,11 +4285,11 @@ def test_on_index_four(df:pd.DataFrame,gz2000:bool=0)->pd.DataFrame:
pd.DataFrame
多头组在各个指数上的超额收益序列
"""
abrets300=test_on_300500(df,hs300=1).to_frame('沪深300')
abrets500=test_on_300500(df,zz500=1).to_frame('中证500')
abrets1000=test_on_300500(df,zz1000=1).to_frame('中证1000')
abrets300=test_on_300500(df,hs300=1,iplot=iplot).to_frame('沪深300')
abrets500=test_on_300500(df,zz500=1,iplot=iplot).to_frame('中证500')
abrets1000=test_on_300500(df,zz1000=1,iplot=iplot).to_frame('中证1000')
if gz2000:
abrets2000=test_on_300500(df,gz2000=1).to_frame('国证2000')
abrets2000=test_on_300500(df,gz2000=1,iplot=iplot).to_frame('国证2000')
abrs=pd.concat([abrets300,abrets500,abrets1000,abrets2000],axis=1)
else:
abrs=pd.concat([abrets300,abrets500,abrets1000],axis=1)
Expand Down
5 changes: 5 additions & 0 deletions 更新日志/version3.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
## 更新日志🗓 — v3

* v3.4.1 — 2022.10.26
> 1. 给func_two_daily、corr_two_daily、cov_two_daily增加了history参数,用于将计算出的结果记录在本地
> 1. 给show_corrs、show_corrs_with_old函数增加了method参数,可以修改求相关系数的方式
> 1. 暂时删去了test_on_300500的国证2000的参数
> 1. 给test_on_300500和test_on_index_four新增了iplot参数,决定是否使用cufflinks画图
* v3.4.0 — 2022.10.25
> 1. 修复了拼接多个dataframe的函数merge_many中的bug
> 1. 修复了导入process模块时的bug
Expand Down

0 comments on commit 6ea3846

Please sign in to comment.