-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcsvdiff.py
executable file
·63 lines (40 loc) · 2.37 KB
/
csvdiff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
import pandas as pd
import sys
# Compare the NOAA-GFDL entries of two catalog CSVs (UDA and the S3
# "esgf-world" bucket) and report which records exist on only one side.
#
# Outputs, written to the current working directory:
#   GFDL_latest_UDA.csv  - latest version of every GFDL dataset file in UDA
#   inS3_notUDA_2.csv    - GFDL records found in S3 but not in UDA
#   inUDA_notS3_2.csv    - GFDL records found in UDA but not in S3
uda_csv = "/Users/ar46/gfdl-aws-analysis/csvs/intake_uda.csv"
s3_csv = "/Users/ar46/gfdl-aws-analysis/csvs/esgf-world.csv"

# dtype='unicode' keeps every column as text so that the merge below compares
# plain strings instead of values whose type pandas inferred per file.
df_uda = pd.read_csv(uda_csv, dtype='unicode')
df_s3 = pd.read_csv(s3_csv, dtype='unicode')
print("There are ", len(df_uda), "records in UDA")
print("There are ", len(df_s3), "records in S3 esgf-world")

# Keep only the part of each location that follows the catalog-specific CMIP6
# root, so the same file gets the same 'path' value in both catalogs.
# A path that does not contain the expected root raises IndexError.
df_uda['path'] = df_uda['path'].map(lambda p: p.split("/uda/CMIP6")[1])
df_s3['path'] = df_s3['path'].map(lambda p: p.split("s3://esgf-world/CMIP6")[1])

# Restrict both catalogs to records whose institute is exactly "NOAA-GFDL".
uda_is_gfdl = df_uda['institute'] == "NOAA-GFDL"
s3_is_gfdl = df_s3['institute'] == "NOAA-GFDL"
df_s3_new = df_s3[s3_is_gfdl]
df_uda_new = df_uda[uda_is_gfdl]

# Drop duplicate versions from the UDA side only: for every dataset file
# (identified by the columns in dataset_key) keep the row that sorts last by
# 'version'. Versions are text, so this is a string comparison.
print("All versions of GFDL datasets in UDA", len(df_uda_new))
dataset_key = [
    "project",
    "institute",
    "model",
    "experiment_id",
    "mip_table",
    "ensemble_member",
    "grid_label",
    "variable",
    "temporal subset",
]
df_uda_new = df_uda_new.sort_values(by=['version']).drop_duplicates(
    subset=dataset_key, keep='last'
)
print("Latest versions of GFDL datasets in UDA: ", len(df_uda_new))
df_uda_new.to_csv("GFDL_latest_UDA.csv", encoding='utf-8', index=False)

# Exclude these two columns from the comparison.
# NOTE: an earlier comment here said to ignore 'path' as well, but 'path' is
# not dropped, so the normalized path still takes part in the merge below.
df_s3_new = df_s3_new.drop(columns=['modeling_realm', 'frequency'])
df_uda_new = df_uda_new.drop(columns=['modeling_realm', 'frequency'])
print("There are ", len(df_uda_new), "GFDL files in UDA")
print("There are ", len(df_s3_new), "GFDL objects in S3")

# Merge once on all columns the two frames share; the '_merge' indicator
# column then says whether a row came from UDA only ('left_only'), S3 only
# ('right_only') or both sides ('both').
merged = df_uda_new.merge(df_s3_new, how='outer', indicator=True)
df = merged.loc[merged['_merge'] == 'left_only']
print("There are ", len(df), "records in UDA ", "that are not in S3")
df2 = merged.loc[merged['_merge'] == 'right_only']
print("There are ", len(df2), "records in S3 ", "that are not in UDA")
df_common = merged.loc[merged['_merge'] == 'both'].drop(columns='_merge')
print("There are ", len(df_common), "GFDL records common in both UDA and S3")

# TODO: put path back to present
# Records only in S3: to be deleted.
df2.to_csv("inS3_notUDA_2.csv", encoding='utf-8', index=False)
# Records only in UDA: to be copied.
df.to_csv("inUDA_notS3_2.csv", encoding='utf-8', index=False)