forked from ssharoff/biberpy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbiber-variation.py
executable file
·53 lines (44 loc) · 1.83 KB
/
biber-variation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Studying Biber exploration with respect to genres
It takes one parameter:
1. a tab-separated file with Biber-like dimensions (with a header, IDs, and the Cat column at the end), for example:
paste l24.dat l24.cats | numlines | sed 's/^/ID-/'
ID-1 A01.pastVerbs A03.presVerbs Cat
ID-2 0.33333 0.75000 A1
ID-3 0.62500 1.04167 A1
It outputs:
1. to STDOUT: Genre, Biber-dim, mean and median for the Genre
2. to STDERR: for each Biber-dim, its overall mean and median, the min-max ranges of its per-genre means and per-genre medians, and the ratio of the median range to the median of the per-genre medians (how much it varies across the genres)
"""
import sys
import os
import re
import numpy as np
import pandas as pd
def range(l):
    """Return the spread of *l*: its largest value minus its smallest.

    NOTE: this shadows the builtin ``range`` inside this module; the name is
    kept because the reporting code of this script calls it under that name.
    """
    highest = np.max(l)
    lowest = np.min(l)
    return highest - lowest
def split_genres(df1, catset):
    """Split a labelled dataframe into one sub-frame per category.

    For every label in *catset* (in iteration order) the rows of *df1* whose
    "Cat" column equals that label are selected, and the number of selected
    rows is reported on STDERR.

    Returns a dict mapping each label to its sub-frame; a label without any
    matching row maps to an empty frame.
    """
    by_genre = {}
    for label in catset:
        matching = df1["Cat"] == label
        selected = df1[matching]
        by_genre[label] = selected
        print(f'{label} -> {len(selected)} examples', file=sys.stderr)
    return by_genre
# --- Script body: read the table, split it by genre, report statistics ---
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError on sys.argv[1].
    sys.exit(f'Usage: {sys.argv[0]} <tab-separated file with Biber dimensions>')
fname = sys.argv[1]

# Use a context manager so the input file is closed as soon as it is parsed
# (the handle was previously left open for the lifetime of the process).
with open(fname) as infile:
    # The first header cell is expected to be 'ID-1'; that column is the index.
    df1 = pd.read_csv(infile, sep='\t', index_col='ID-1')
print(f'Read {fname} {len(df1)} rows', file=sys.stderr)

# Genre labels to compare; each one is matched against the "Cat" column.
catlist = 'A1 A4 A7 A8 A9 A11 A12 A14 A16 A17'.split()
subsets = split_genres(df1, catlist)

# Header of the per-dimension summary written to STDERR.
print('col\t mean\t median\t range_mean\t range_median\t range_rate', file=sys.stderr)

# Every column except the last one (the "Cat" label column) is a dimension.
for col in df1.columns.tolist()[:-1]:
    mean_values = []
    median_values = []
    for cat in catlist:
        curvalues = subsets[cat][col]
        means = np.mean(curvalues)
        mean_values.append(means)
        medians = np.median(curvalues)
        median_values.append(medians)
        # STDOUT: genre label without its first character, the dimension,
        # and the genre's mean and median scaled by 100.
        print(f'{cat[1:]}\t{col}\t{means*100:.4f}\t{medians*100:.4f}')
    # STDERR: overall mean/median of the dimension, the ranges of the
    # per-genre means and medians (all scaled by 100), and the median range
    # relative to the median of the per-genre medians. The 1e-10 term keeps
    # the division defined when that median is zero.
    print(f'{col}\t{np.mean(df1[col])*100:.4f}\t{np.median(df1[col])*100:.4f}\t{range(mean_values)*100:.4f}\t{range(median_values)*100:.4f}\t{range(median_values)/(np.median(median_values)+1e-10):.4f}', file=sys.stderr)