-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpagesal125.pl
100 lines (84 loc) · 1.88 KB
/
pagesal125.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
########################
# PageSAL125.pl
########################
#
# This script reads the output files from pdftotext (0x0C breaks pages)
# and outputs page number, top line, and info within the page.
# The script will also output the tab delimited file based on the
# data.
#
# Top line example:
# 26 PUBLIC LAWS-CH. 42-MAR. 30, 1949 [63 STAT.
# (The text files use left and right folios - even page numbers on left).
use strict;
use warnings;

# NOTE: `use utf8` is deliberately NOT enabled. The input is read as raw
# bytes, and the en dash inside the PUBLIC LAW pattern below is matched as
# the byte sequence stored in this source file. Enabling `use utf8` without
# also decoding the input would stop that pattern from matching.

# Volume number this script handles. It is written to the control file and
# is the number expected in front of "STAT." in the page headers.
my $volume = 125;

# Command line: infile outfile1 outfile2 (any extra arguments are ignored).
#   infile   - text produced by pdftotext (form feed 0x0C separates pages)
#   outfile1 - tab-delimited control file ("publaw-N<TAB>statpage" lines)
#   outfile2 - per-page listing wrapped in <PDFSAL>/<page> tags
my ($infile, $outfile1, $outfile2) = @ARGV;

if (!defined $outfile2 || $outfile2 eq '' || !-e $infile) {
    # The usage text names both output files; the script needs all three
    # arguments and an existing input file.
    print STDERR "Syntax: PageSAL125.pl infile outfile1 outfile2\n";
    print STDERR "This script reads the text files from pdftotext and outputs\n";
    print STDERR "two files. (1) page number, top line, contents and \n";
    print STDERR "(2) the tab-delimited control file needed to break SAL files\n";
    exit 1;
}

# Three-argument open with lexical handles; every failure is reported
# instead of silently producing empty or missing output.
open(my $in, '<', $infile)
    or die "Cannot open input file '$infile': $!";
open(my $control_out, '>', $outfile1)
    or die "Cannot open output file '$outfile1': $!";
open(my $pages_out, '>', $outfile2)
    or die "Cannot open output file '$outfile2': $!";

my $page      = 1;    # running page counter, bumped on every form feed
my $plnum     = 1;    # number of the most recently seen public law
my $lastplnum = 0;    # bookkeeping value compared against $plnum (see below)
my $statpage  = 3;    # most recently seen "STAT." page; starts at 3

print {$pages_out} "<PDFSAL>\n";
print {$pages_out} "<page number=\"1\">\n";
print {$control_out} "vol=$volume\n";
print {$control_out} "statpage1=-2\n";

while (my $line = <$in>) {
    # The three checks are independent and run in this order on every line,
    # so a line that carries the form feed is still scanned for the STAT
    # and PUBLIC LAW markers afterwards.

    # A form feed anywhere in the line starts a new page.
    if ($line =~ m{\x0C}) {
        $page++;

        # Deferred control-file entry: written at the page break when the
        # current law number equals $lastplnum, using the STAT page seen
        # so far.
        if ($plnum == $lastplnum) {
            print {$control_out} "publaw-$plnum\t$statpage\n";
            $lastplnum = $plnum + 1;
        }

        print {$pages_out} "</page>\n";
        print {$pages_out} "<page number=\"$page\">\n";
    }

    # Header fragment such as "125 STAT. 26": remember the page number.
    # NOTE(review): the '.' after STAT is unescaped and therefore matches
    # any single character; left unchanged so existing output is preserved.
    if ($line =~ m{$volume STAT. (\d+)}) {
        $statpage = $1;
        print {$pages_out} " <statpage>$statpage</statpage>\n";
    }

    # Header fragment "PUBLIC LAW <number><en dash><number>".
    if ($line =~ m{PUBLIC LAW (\d+)–(\d+)}) {
        my $pl_prefix = $1;
        $plnum = $2;
        print {$pages_out} " <publaw>${pl_prefix}-${plnum}</publaw>\n";

        # Immediate control-file entry: written as soon as the law number
        # is exactly one greater than $lastplnum.
        # NOTE(review): this test and the page-break test above differ by
        # one, and both set $lastplnum to $plnum + 1. The logic is kept
        # exactly as in the original -- confirm the intent before changing.
        if ($plnum == $lastplnum + 1) {
            print {$control_out} "publaw-$plnum\t$statpage\n";
            $lastplnum = $plnum + 1;
        }
    }
}

close($in);
close($control_out)
    or die "Error closing output file '$outfile1': $!";

# Close the last open page and the document; no trailing newline is written
# after the final tag.
print {$pages_out} "</page>\n";
print {$pages_out} "</PDFSAL>";
close($pages_out)
    or die "Error closing output file '$outfile2': $!";