forked from matteoacrossi/texprlcount
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtexprlcount.pl
executable file
·239 lines (189 loc) · 6.74 KB
/
texprlcount.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/perl
#
# This script estimates the word count of a .tex file according to the PRL
# guidelines for length, available at
#
# https://journals.aps.org/authors/length-guide
#
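# TeXcount is used for text, headers, captions and inline equations. Displayed
# math lines and tables are counted directly from the .tex source, while the
# aspect ratio of figures is obtained from the LaTeX .log file. If the .log
# file is not present, the script tries to generate it by running pdflatex.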
use strict;
use warnings;
use POSIX;
use Math::Round;
use List::MoreUtils 'first_index';
use File::Temp qw/ tempdir /;
use File::Basename;
# Command line handling: exactly one argument is expected, the .tex file to
# analyse (the ".tex" extension may be omitted).
if (@ARGV < 1) {
    # Report the name the script was actually invoked with, on STDERR, and
    # signal the usage error through a non-zero exit status.
    print STDERR "Usage: ", basename($0), " filename\n";
    exit 1;
}
my $filename = $ARGV[0];
# fileparse() treats each suffix as a regular expression, so the dot has to be
# escaped: a plain ".tex" would also strip e.g. "rtex" from "vortex".
my ($name, $path, $suffix) = fileparse($filename, qr/\.tex/);
# The rest of the script refers to "$name.tex" and "$name.log" by relative
# name, so it has to run from the directory that contains the document.
chdir $path or die "Cannot change directory to $path: $!\n";
if (!-e "$name.tex") {
    print STDERR "The file $name.tex doesn't exist\n";
    exit 1;
}
# Open the tex file and the LaTeX log file. Three-argument open keeps special
# characters in the file name from being interpreted as an open mode.
# $path already ends with a directory separator, so it is joined directly.
open(my $texfileh, '<', "$name.tex")
    or die "File $path$name.tex not found: $!\n";
my $logfileh;
unless (open($logfileh, '<', "$name.log")) {
    print "$name.log file not found, compiling the texfile...\n";
    my $tmpdir = tempdir( CLEANUP => 1 );
    # Run pdflatex through a list-form pipe so that no shell is involved (file
    # names with spaces or shell metacharacters are passed verbatim).
    # -interaction=nonstopmode keeps pdflatex from waiting for keyboard input
    # on a TeX error, which would look like a hang because its output is
    # captured here.
    open(my $pdflatexh, '-|', 'pdflatex', '-interaction=nonstopmode',
        "-output-directory=$tmpdir", $name)
        or die "Could not run pdflatex: $!\n";
    1 while <$pdflatexh>;   # read and discard the compiler output
    close $pdflatexh;       # a failed compilation is detected via the missing log
    open($logfileh, '<', "$tmpdir/$name.log")
        or die "File $name.log not found. There were problems during the compilation.";
}
# Slurp both files into strings; $/ is only undefined inside each do-block so
# the record separator is not changed for the rest of the script.
my $logfile = do { local $/; <$logfileh> };
my $texfile = do { local $/; <$texfileh> };
close $logfileh;
close $texfileh;
# Strip comments from the tex source: everything from an unescaped % up to
# (but not including) the end of the line. A look-behind is used instead of
# matching the preceding character, so that character is kept and a comment
# at the very beginning of the file is removed as well.
$texfile =~ s/(?<!\\)%[^\n]*//g;
# Count the number of characters in the abstract, ignoring line breaks.
# A document without an abstract environment yields a length of 0.
my $abstract;
($abstract) = $texfile =~ /\\begin\{abstract\}(.*?)\\end\{abstract\}/s;
$abstract = '' unless defined $abstract;
$abstract =~ s/\R//g;
$abstract = length($abstract);
my $totalcount = 0; # Total word count
# texcount provides the word count of text, headers and captions, plus one
# word per inline equation (see the weights passed to -sum). Displayed
# equations are added separately further down (16 words per line).
#
# A temporary rule file tells texcount to exclude abstract and acknowledgments
# from the count.
# NOTE: a pre-existing file called "tcrules" in the document directory is
# overwritten and removed by this step.
open(my $tmp, '>', 'tcrules')
    or die "Cannot create the temporary file tcrules: $!\n";
print $tmp "\%group abstract 0 0\n\%group acknowledgments 0 0";
close($tmp)
    or die "Cannot write the temporary file tcrules: $!\n";
# Run texcount through a list-form pipe: no shell is involved, so file names
# containing spaces or shell metacharacters are passed verbatim.
my $texcount = '';
if (open(my $texcounth, '-|', 'texcount', "$name.tex", '-opt=tcrules',
        '-utf8', '-sum=1,1,1,0,0,1,0')) {
    my $output = do { local $/; <$texcounth> };
    $texcount = $output if defined $output;
    close $texcounth;
}
else {
    warn "Could not run texcount: $!\n";
}
unlink 'tcrules';
print "\n";
print "Words in text, headers and equations\n";
print "------------------------------------\n";
print "$texcount";
# Keep the total numeric even if texcount failed or printed something
# unexpected; the remaining contributions are still reported.
if ($texcount =~ /Sum\scount:\s(\d+)/) {
    $totalcount = $1;
}
else {
    warn "No \"Sum count\" found in the texcount output; text words are counted as 0.\n";
}
print "Abstract length: $abstract characters\n\n";
# DISPLAYED MATH
################
#
# Every line of displayed math is counted as 16 words.
#
# First, the environments that can contain (multiline) displayed equations:
# equation, align, eqnarray and gather, each with or without a trailing star.
# The environment name is captured so that \end{...} must match \begin{...}.
my @aligns = $texfile =~ /\\begin\{((?:equation|align|eqnarray|gather)\*?)\}(.*?)\\end\{\1\}/sg;
# Start from 0 so that a document without displayed math is reported as such
# instead of producing "uninitialized value" warnings.
my $mathlinecount = 0;
# The match returns (environment name, body) pairs: bodies sit at odd indices.
for (my $i = 1; $i <= $#aligns; $i += 2) {
    # n line breaks (\\) inside an environment mean n + 1 lines.
    my $linebreaks = () = $aligns[$i] =~ /\\\\/g;
    $mathlinecount += $linebreaks + 1;
}
# Now we check for $$ .. $$ (one line each)
my @dollar_displays = $texfile =~ /\$\$(.*?)\$\$/sg;
$mathlinecount += scalar @dollar_displays;
# And for \[ \] (one line each). The look-behind skips "\\[2pt]"-style row
# spacing, whose "\[" would otherwise be taken for the start of a display.
my @bracket_displays = $texfile =~ /(?<!\\)\\\[(.*?)\\\]/sg;
$mathlinecount += scalar @bracket_displays;
$totalcount += 16*$mathlinecount;
print "Number of displayed math lines: $mathlinecount\n\n";
# TABLES
##########
#
# Each tabular environment contributes 13 words plus 6.5 words per row.
# Rows are estimated from the row terminators found in the tabular body.
my @tables = $texfile =~ /\\begin\{tabular\}(.*?)\\end\{tabular\}/sg;
my $tablecount = 0;
my $tablelinecount = 0;
for my $body (@tables) {
    # Row terminators: "\\" and "\tabularnewline".
    my $breaks          = () = $body =~ /\\\\/g;
    my $newlines        = () = $body =~ /\\tabularnewline/g;
    # A closing \hline or a closing "\\" at the very end of the body is
    # subtracted again.
    my $closing_hline   = () = $body =~ /\\hline[\s]*$/g;
    my $closing_break   = () = $body =~ /\\\\[\s]*$/g;

    $tablecount += 1;
    # The "+ 1" accounts for the last row.
    $tablelinecount += $breaks + $newlines - $closing_hline - $closing_break + 1;
}
print "Number of tables: $tablecount\n";
print "Table rows: $tablelinecount\n\n";
$totalcount += 13*$tablecount + 6.5 * $tablelinecount;
# IMAGES
##########
#
# Estimated word count of the figures. With aspect ratio = width / height,
# the formulas applied below are
#
#   single-column figure (figure):   150 / (aspect ratio) + 20
#   two-column figure   (figure*):   300 / (0.5 * aspect ratio) + 40
#
# both rounded up to the next integer.
#
# The pdflatex log file is used for this task. In the log file, for each
# included graphics an output similar to the following appears
#
# > <filename.pdf, id=116, 199.74625pt x 108.405pt>
# > File: filename.pdf Graphic file (type pdf)
# >
# > <use filename.pdf>
# > Package pdftex.def Info: filename.pdf used on input line 313.
# > (pdftex.def) Requested size: 221.3985pt x 120.16223pt.
#
print "Images\n";
print "------\n";
my $imageswordcount = 0;
my @images;
my @sizes;
# Extract the names of images from the log file
@images = $logfile =~ /\<use (.*?)\>/g;
if (@images) {
    # Flat list of (width, height) pairs; the n-th pair is taken to belong to
    # the n-th "<use ...>" entry.
    @sizes = $logfile =~ /Requested size:\s([\d\.]+)pt\sx\s([\d\.]+)pt/g;
    # Look in the tex file to check whether each image sits in a single-column
    # (figure) or in a double-column (figure*) environment.
    my @figenvtype = $texfile =~ /\\begin\{figure(\*?)\}/g;
    my @figenv = $texfile =~ /\\begin\{figure(.*?)\\end\{figure/gs;
    my $ml = max_length(@images);
    printf "%-${ml}s Aspect ratio Est. word count Two-column\n", "File name";
    printf "%${ml}.${ml}s-----------------------------------------------\n", "---------------------------------------------------";
    for my $i (0 .. $#figenv) {
        my @img_in_env = $figenv[$i] =~ /\\includegraphics(?:\[[^\]]*\])?\{(.*?)\}/gs;
        printf "Figure %s\n", $i + 1;
        foreach my $imgname (@img_in_env) {
            # Match the name from the tex file literally (\Q...\E): file names
            # contain dots and may contain other regex metacharacters.
            my $index = first_index { /\Q$imgname\E/ } @images;
            # Without this guard a failed lookup (-1) would silently pick up
            # the size of the last image in the log.
            if ($index < 0 || !defined $sizes[2*$index+1] || $sizes[2*$index+1] == 0) {
                warn "Could not find the size of image '$imgname' in the log file; skipping it.\n";
                next;
            }
            my $aspect = nearest(0.001, $sizes[2*$index] / $sizes[2*$index+1]);
            my $words;
            if ($figenvtype[$i] eq '') { # The environment is plain \begin{figure}
                $words = ceil(150 / $aspect + 20);
            }
            elsif ($figenvtype[$i] eq '*') { # The environment is two column \begin{figure*}
                $words = ceil(300 / (0.5*$aspect) + 40);
            }
            else {
                die "Error while processing the figure environments";
            }
            $imageswordcount += $words;
            # Print the values computed for this very image, so the row stays
            # correct even when the order in the log differs from the order
            # in which images are processed here.
            printf " %-${ml}s %12.2f %15d %s\n", $images[$index], $aspect, $words, $figenvtype[$i];
        }
    }
    print "\nTotal word count for images: $imageswordcount\n\n";
}
else {
    print "The file doesn't contain images.\n\n";
}
$totalcount += $imageswordcount;
print "Total word count (words + equations + images)\n$totalcount\n";
# Returns the length of the longest string among the arguments,
# or -1 when called with an empty list.
sub max_length {
    my $longest = -1;
    foreach my $item (@_) {
        my $len = length $item;
        $longest = $len if $len > $longest;
    }
    return $longest;
}