From 6ea3846c5ca501be4aa4c3821deb057440b96ee0 Mon Sep 17 00:00:00 2001
From: chenzongwei <63836858+chen-001@users.noreply.github.com>
Date: Wed, 26 Oct 2022 19:59:47 +0800
Subject: [PATCH] v3.4.1

---
 pure_ocean_breeze/__init__.py      |   4 +-
 pure_ocean_breeze/data/tools.py    | 107 +++++++++++++++---
 pure_ocean_breeze/labor/process.py |  71 ++++++++----
 .../version3.md"                   |   5 +
 4 files changed, 145 insertions(+), 42 deletions(-)

diff --git a/pure_ocean_breeze/__init__.py b/pure_ocean_breeze/__init__.py
index 4bff2bb..b47ca78 100644
--- a/pure_ocean_breeze/__init__.py
+++ b/pure_ocean_breeze/__init__.py
@@ -2,8 +2,8 @@
 A framework for quantitative multi-factor research, covering data, backtesting, factor construction and more
 """
 
-__updated__ = "2022-10-25 07:45:37"
-__version__ = "3.4.0"
+__updated__ = "2022-10-26 10:56:42"
+__version__ = "3.4.1"
 __author__ = "chenzongwei"
 __author_email__ = "winterwinter999@163.com"
 __url__ = "https://github.com/chen-001/pure_ocean_breeze"
diff --git a/pure_ocean_breeze/data/tools.py b/pure_ocean_breeze/data/tools.py
index f21713a..846f7f9 100644
--- a/pure_ocean_breeze/data/tools.py
+++ b/pure_ocean_breeze/data/tools.py
@@ -2,8 +2,9 @@
 Utility functions for reading data files in some uncommon formats, plus other data tools
 """
 
-__updated__ = "2022-10-24 20:33:41"
+__updated__ = "2022-10-26 18:51:43"
 
+import os
 import h5py
 import pandas as pd
 import tqdm
@@ -13,6 +14,7 @@
 import numpy_ext as npext
 import scipy.stats as ss
 from functools import reduce, partial
+from loguru import logger
 from typing import Callable, Union
 
 try:
@@ -21,6 +23,7 @@
     rqdatac.init()
 except Exception:
     print("RiceQuant is not connected for now")
+from pure_ocean_breeze.state.homeplace import HomePlace
 
 
 def read_h5(path: str) -> dict:
@@ -377,7 +380,11 @@ def merge_many(dfs: list[pd.DataFrame], names: list = None) -> pd.DataFrame:
 
 
 def corr_two_daily(
-    df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    history: str = None,
+    rolling_window: int = 20,
+    n_jobs: int = 6,
 ) -> pd.DataFrame:
     """Compute, stock by stock, the rolling-window time-series correlation between two factors
 
@@ -387,6 +394,8 @@
         The first factor, with dates as the index and stock codes as the columns
     df2 : pd.DataFrame
         The second factor, with dates as the index and stock codes as the columns
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -402,12 +411,21 @@
     def corr_in(a, b, c):
         return c.iloc[-1], np.corrcoef(a, b)[0, 1]
 
     return func_two_daily(
-        df1=df1, df2=df2, func=corr_in, rolling_window=rolling_window, n_jobs=n_jobs
+        df1=df1,
+        df2=df2,
+        func=corr_in,
+        history=history,
+        rolling_window=rolling_window,
+        n_jobs=n_jobs,
     )
 
 
 def cov_two_daily(
-    df1: pd.DataFrame, df2: pd.DataFrame, rolling_window: int = 20, n_jobs: int = 6
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    history: str = None,
+    rolling_window: int = 20,
+    n_jobs: int = 6,
 ) -> pd.DataFrame:
     """Compute, stock by stock, the rolling-window time-series covariance between two factors
@@ -417,6 +435,8 @@
         The first factor, with dates as the index and stock codes as the columns
     df2 : pd.DataFrame
         The second factor, with dates as the index and stock codes as the columns
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -432,7 +452,12 @@
     def cov_in(a, b, c):
         return c.iloc[-1], np.cov(a, b)[0, 1]
 
     return func_two_daily(
-        df1=df1, df2=df2, func=cov_in, rolling_window=rolling_window, n_jobs=n_jobs
+        df1=df1,
+        df2=df2,
+        func=cov_in,
+        history=history,
+        rolling_window=rolling_window,
+        n_jobs=n_jobs,
     )
 
 
@@ -440,6 +465,7 @@ def func_two_daily(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
     func: Callable,
+    history: str = None,
     rolling_window: int = 20,
     n_jobs: int = 6,
 ) -> pd.DataFrame:
@@ -453,6 +479,8 @@
         The second factor, with dates as the index and stock codes as the columns
     func : Callable
         The function applied to the two series of values
+    history : str, optional
+        Local file from which previously computed results are read, by default None
     rolling_window : int, optional
         Length of the rolling window, by default 20
     n_jobs : int, optional
@@ -474,17 +502,64 @@ def func_rolling(df):
         )
         return df
 
-    twins = merge_many([df1, df2])
-    tqdm.tqdm.pandas()
-    corrs = twins.groupby(["code"]).progress_apply(func_rolling)
-    cor = []
-    for i in range(len(corrs)):
-        df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
-        cor.append(df)
-    cors = pd.concat(cor)
-    cors.columns = ["date", "corr", "code"]
-    cors = cors.pivot(index="date", columns="code", values="corr")
-    return cors
+    homeplace = HomePlace()
+    if history is not None:
+        if os.path.exists(homeplace.update_data_file + history):
+            old = pd.read_feather(homeplace.update_data_file + history)
+            old = old.set_index(list(old.columns)[0])
+            new_end = min(df1.index.max(), df2.index.max())
+            if new_end > old.index.max():
+                old_end = datetime.datetime.strftime(old.index.max(), "%Y%m%d")
+                logger.info(f"Last updated through {old_end}")
+                df1a = df1[df1.index <= old.index.max()].tail(rolling_window - 1)
+                df1b = df1[df1.index > old.index.max()]
+                df1 = pd.concat([df1a, df1b])
+                df2a = df2[df2.index <= old.index.max()].tail(rolling_window - 1)
+                df2b = df2[df2.index > old.index.max()]
+                df2 = pd.concat([df2a, df2b])
+                twins = merge_many([df1, df2])
+                tqdm.tqdm.pandas()
+                corrs = twins.groupby(["code"]).progress_apply(func_rolling)
+                cor = []
+                for i in range(len(corrs)):
+                    df = (
+                        pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
+                    )
+                    cor.append(df)
+                cors = pd.concat(cor)
+                cors.columns = ["date", "corr", "code"]
+                cors = cors.pivot(index="date", columns="code", values="corr")
+                if history is not None:
+                    if os.path.exists(homeplace.update_data_file + history):
+                        cors = pd.concat([old, cors])
+                        cors = drop_duplicates_index(cors)
+                    cors.reset_index().to_feather(homeplace.update_data_file + history)
+                    new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
+                    logger.info(f"Updated through {new_end}")
+                return cors
+            else:
+                logger.info("Already up to date")
+                return old
+        else:
+            logger.info("Computing for the first time; please wait, the result will be saved once finished")
+    twins = merge_many([df1, df2])
+    tqdm.tqdm.pandas()
+    corrs = twins.groupby(["code"]).progress_apply(func_rolling)
+    cor = []
+    for i in range(len(corrs)):
+        df = pd.DataFrame(corrs.iloc[i]).dropna().assign(code=corrs.index[i])
+        cor.append(df)
+    cors = pd.concat(cor)
+    cors.columns = ["date", "corr", "code"]
+    cors = cors.pivot(index="date", columns="code", values="corr")
+    if history is not None:
+        if os.path.exists(homeplace.update_data_file + history):
+            cors = pd.concat([old, cors])
+            cors = drop_duplicates_index(cors)
+        cors.reset_index().to_feather(homeplace.update_data_file + history)
+        new_end = datetime.datetime.strftime(cors.index.max(), "%Y%m%d")
+        logger.info(f"Updated through {new_end}")
+    return cors
 
 
 def drop_duplicates_index(new: pd.DataFrame) -> pd.DataFrame:
diff --git a/pure_ocean_breeze/labor/process.py b/pure_ocean_breeze/labor/process.py
index 0f01e27..93484c1 100644
--- a/pure_ocean_breeze/labor/process.py
+++ b/pure_ocean_breeze/labor/process.py
@@ -1,4 +1,4 @@
-__updated__ = "2022-10-11 12:35:40"
+__updated__ = "2022-10-26 18:52:42"
 
 import warnings
 
@@ -999,6 +999,7 @@ def show_corrs(
     factor_names: list[str] = None,
     print_bool: bool = True,
     show_percent: bool = True,
+    method: str='spearman',
 ) -> pd.DataFrame:
     """Show the pairwise cross-sectional correlations among a set of factors
 
@@ -1012,6 +1013,8 @@ def show_corrs(
         Whether to print the table of pairwise correlation coefficients, by default True
     show_percent : bool, optional
         Whether to display the values as percentages, by default True
+    method : str, optional
+        Method used to compute the correlation coefficients, by default "spearman"
 
     Returns
     -------
@@ -1022,7 +1025,7 @@ def show_corrs(
     for i in range(len(factors)):
         main_i = factors[i]
         follows = factors[i + 1 :]
-        corr = [show_corr(main_i, i, plt_plot=False) for i in follows]
+        corr = [show_corr(main_i, i, plt_plot=False,method=method) for i in follows]
         corr = [np.nan] * (i + 1) + corr
         corrs.append(corr)
     if factor_names is None:
@@ -1062,12 +1065,27 @@ def de_cross(
     return (y - xs)()
 
 
-def show_corrs_with_old(df):
-    df0=df.resample('M').last()
-    if df.shape[0]/df0.shape[0]>2:
-        daily=1
-    else:
-        daily=0
+def show_corrs_with_old(df:pd.DataFrame=None,method:str='spearman')->pd.DataFrame:
+    """Compute the correlations between a new factor and the existing (old) factors
+
+    Parameters
+    ----------
+    df : pd.DataFrame, optional
+        The new factor, by default None
+    method : str, optional
+        Method used to compute the correlations, by default 'spearman'
+
+    Returns
+    -------
+    pd.DataFrame
+        The correlation matrix
+    """
+    if df is not None:
+        df0=df.resample('M').last()
+        if df.shape[0]/df0.shape[0]>2:
+            daily=1
+        else:
+            daily=0
     olds=[]
     for i in range(1,100):
         try:
@@ -1078,8 +1096,11 @@ def show_corrs_with_old(df):
             olds.append(old)
         except Exception:
             break
-    olds=[df]+olds
-    corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))])
+    if df is not None:
+        olds=[df]+olds
+        corrs=show_corrs(olds,['new']+[f'old{i}' for i in range(1,len(olds))],method=method)
+    else:
+        corrs=show_corrs(olds,[f'old{i}' for i in range(1,len(olds))],method=method)
     return corrs
 
 
@@ -4214,7 +4235,7 @@ def ols_in(self, df):
     ...
 
 
-def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,zz2000:bool=0)->pd.Series:
+def test_on_300500(df:pd.DataFrame,hs300:bool=0,zz500:bool=0,zz1000:bool=0,iplot:bool=1)->pd.Series:
     """Run long-short and long-only tests of a factor within an index's constituent stocks
 
     Parameters
@@ -4227,33 +4248,35 @@
     ----------
     df : pd.DataFrame
         Factor values, with dates as the index and stock codes as the columns
     hs300 : bool, optional
         Test within the CSI 300 constituents, by default 0
     zz500 : bool, optional
         Test within the CSI 500 constituents, by default 0
     zz1000 : bool, optional
         Test within the CSI 1000 constituents, by default 0
-    zz2000 : bool, optional
-        Test within the CNI 2000 constituents, by default 0
+    iplot : bool, optional
+        Whether to plot with cufflinks during the long-short backtest
 
     Returns
     -------
    pd.Series
         Excess-return series of the long group relative to the index
     """
-    fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
-    shen=pure_moonnight(fi300)
+    fi300=daily_factor_on300500(df,hs300=hs300,zz500=zz500,zz1000=zz1000)
+    shen=pure_moonnight(fi300,iplot=iplot)
     if shen.shen.group_net_values.group1.iloc[-1]>shen.shen.group_net_values.group10.iloc[-1]:
-        print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
-        abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
+        print(make_relative_comments(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000))
+        abrets=make_relative_comments_plot(shen.shen.group_rets.group1,hs300=hs300,zz500=zz500,zz1000=zz1000)
         return abrets
     else:
-        print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000))
-        abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000,zz2000=zz2000)
+        print(make_relative_comments(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000))
+        abrets=make_relative_comments_plot(shen.shen.group_rets.group10,hs300=hs300,zz500=zz500,zz1000=zz1000)
         return abrets
 
 
-def test_on_index_four(df:pd.DataFrame,gz2000:bool=0)->pd.DataFrame:
+def test_on_index_four(df:pd.DataFrame,iplot:bool=1,gz2000:bool=0)->pd.DataFrame:
     """Run long-short and long-only excess-return tests of a factor on the constituents of four indexes at once: CSI 300, CSI 500, CSI 1000 and CNI 2000
 
     Parameters
     ----------
     df : pd.DataFrame
         Factor values, with dates as the index and stock codes as the columns
+    iplot : bool, optional
+        Whether to plot with cufflinks during the long-short backtest
     gz2000 : bool, optional
         Whether to also run the test on the CNI 2000, by default 0
 
@@ -4262,11 +4285,11 @@
     pd.DataFrame
         Excess-return series of the long group relative to each index
     """
-    abrets300=test_on_300500(df,hs300=1).to_frame('沪深300')
-    abrets500=test_on_300500(df,zz500=1).to_frame('中证500')
-    abrets1000=test_on_300500(df,zz1000=1).to_frame('中证1000')
+    abrets300=test_on_300500(df,hs300=1,iplot=iplot).to_frame('沪深300')
+    abrets500=test_on_300500(df,zz500=1,iplot=iplot).to_frame('中证500')
+    abrets1000=test_on_300500(df,zz1000=1,iplot=iplot).to_frame('中证1000')
     if gz2000:
-        abrets2000=test_on_300500(df,gz2000=1).to_frame('国证2000')
+        abrets2000=test_on_300500(df,gz2000=1,iplot=iplot).to_frame('国证2000')
         abrs=pd.concat([abrets300,abrets500,abrets1000,abrets2000],axis=1)
     else:
         abrs=pd.concat([abrets300,abrets500,abrets1000],axis=1)
diff --git "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md" "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
index 5cd2057..3a06ac2 100644
--- "a/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
+++ "b/\346\233\264\346\226\260\346\227\245\345\277\227/version3.md"
@@ -1,5 +1,10 @@
 ## Changelog 🗓 — v3
 
+* v3.4.1 — 2022.10.26
+> 1. Added a history parameter to func_two_daily, corr_two_daily and cov_two_daily, used to save the computed results to a local file
+> 1. Added a method parameter to show_corrs and show_corrs_with_old, so the way the correlation coefficients are computed can be changed
+> 1. Temporarily removed the CNI 2000 (gz2000) parameter from test_on_300500
+> 1. Added an iplot parameter to test_on_300500 and test_on_index_four that decides whether cufflinks is used for plotting
 * v3.4.0 — 2022.10.25
 > 1. Fixed a bug in merge_many, the function that concatenates multiple dataframes
 > 1. Fixed a bug that occurred when importing the process module
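The following is not part of the patch above. It is a minimal usage sketch of the new history parameter on corr_two_daily, assuming the package's local paths have already been configured (HomePlace) and using small randomly generated factor panels; the file name "fac1_fac2_corr.feather" is hypothetical, and per the patch the file is resolved by concatenating it onto HomePlace().update_data_file.

```python
import numpy as np
import pandas as pd
from pure_ocean_breeze.data.tools import corr_two_daily

# Hypothetical daily factor panels: index = trade dates, columns = stock codes.
dates = pd.date_range("2021-01-04", "2022-10-26", freq="B")
codes = ["000001.SZ", "600000.SH", "600519.SH"]
fac1 = pd.DataFrame(np.random.randn(len(dates), len(codes)), index=dates, columns=codes)
fac2 = pd.DataFrame(np.random.randn(len(dates), len(codes)), index=dates, columns=codes)

# First run: the full 20-day rolling correlation is computed and, because
# `history` is given, saved as a feather file under HomePlace().update_data_file.
corr = corr_two_daily(fac1, fac2, history="fac1_fac2_corr.feather", rolling_window=20)

# Later runs with the same `history` file reuse the stored result: if the panels
# now extend further in time, only the new dates are computed and appended;
# otherwise the function logs that it is already up to date and returns the old result.
corr = corr_two_daily(fac1, fac2, history="fac1_fac2_corr.feather", rolling_window=20)
```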
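Also not part of the patch: a sketch of the other two interface changes in v3.4.1, the method argument and the iplot switch, reusing the hypothetical fac1 panel from the previous example. Whether "pearson" is accepted depends on what show_corr forwards to pandas, and show_corrs_with_old assumes the numbered old-factor files it looks for exist locally. Note that test_on_index_four's gz2000 branch still calls test_on_300500 with a gz2000 argument that this patch removes, so gz2000 is left at its default of 0 here.

```python
from pure_ocean_breeze.labor.process import show_corrs_with_old, test_on_index_four

# Correlations of a new factor with the locally stored old factors.
# method defaults to "spearman"; a Pearson correlation can be requested instead.
corrs = show_corrs_with_old(fac1, method="pearson")

# Long-only excess returns on the CSI 300 / CSI 500 / CSI 1000 constituents.
# iplot=1 (the default) lets pure_moonnight draw the long-short backtest with
# cufflinks; iplot=0 turns the cufflinks-style plotting off.
abrs = test_on_index_four(fac1, iplot=0)
```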