-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.php
405 lines (371 loc) · 12.5 KB
/
index.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
<?php // encoding="UTF-8"
/**
Diverses statistiques pour renseigner l'étendue d'un fichier XML
© 2012, <a href="http://algone.net/">Algone</a>,
<a href="http://www.cecill.info/licences/Licence_CeCILL-C_V1-fr.html">licence CeCILL-C</a>
(LGPL compatible droit français)
<ul>
<li>2012 [FG] <a onmouseover="this.href='mailto'+'\x3A'+'frederic.glorieux'+'\x40'+'algone.net'">Frédéric Glorieux</a></li>
</ul>
*/
// Open the session right away, in the hope that this file is included before any output is sent
session_cache_limiter(false); // important to get correct cache headers
session_start();
// Default configuration: register the bundled French stop list when the file is present
$file=dirname(__FILE__).'/fr.stop';
if (file_exists($file)) {
  xmlstats::$stoplist['Français, mots vides']=realpath($file);
}
// Optional local configuration, loaded only when a conf.php sits next to this script
if (file_exists(dirname(__FILE__).'/conf.php')) {
  include_once(dirname(__FILE__).'/conf.php');
}
// Companion sources; presumably they declare the TagList and FormList classes used by xmlstats::body()
include dirname(__FILE__).'/TagList.php';
include dirname(__FILE__).'/FormList.php';
// Give the script plenty of time and lift the memory limit
ini_set("max_execution_time", 30000);
ini_set('memory_limit', -1);
/**
 * Web front-end producing statistics (tag tables, frequent word lists) about XML files.
 *
 * The files to analyse come either from a configured corpus (a glob pattern
 * registered in self::$glob) or from uploads remembered in $_SESSION['xml']
 * as "display name => path on disk". The statistics themselves are computed
 * by the TagList and FormList classes.
 */
class xmlstats {
  /** Corpus name => glob pattern of the XML files to propose */
  static public $glob=array();
  /** Stop list label => path of the stop word file */
  static public $stoplist=array();
  /** Most recent modification time (Unix timestamp) among the files of the session */
  static public $lastModified=0;
  /** A cache dir, for corpus or session; defaults to a folder in the system temp dir */
  static public $cacheDir;
  /** Reserved for the requested corpus; not read or written by the code of this class */
  static private $corpus;
  /** Prefix of the temp files created for uploads; only files bearing it are ever deleted */
  const TMP_PREFIX='xmlstats_';

  /**
   * Placeholder, does nothing.
   */
  static public function head() {
  }

  /**
   * Main entry point: handles the request and prints the result.
   *
   * Request parameters read here: corpus, xml (upload), xpath, format (csv|txt,
   * anything else means html), stop, help, taglist, nocache and the mode flags
   * value/tokenize/locution/nodelist.
   * For csv and txt the script exits at the end of this method, so that the
   * caller cannot append anything to the generated document.
   */
  static public function body() {
    if (!isset($_SESSION['xml']) || !is_array($_SESSION['xml'])) $_SESSION['xml']=array();
    $corpus=null;
    // no cache dir configured: use a folder in the system temp dir
    if (!self::$cacheDir) self::$cacheDir=sys_get_temp_dir()."/xmlstats/";
    // corpus requested, add files from filesystem to session
    if (isset($_GET['corpus']) && is_string($_GET['corpus']) && isset(self::$glob[$_GET['corpus']])) {
      // take the name from the very superglobal that has just been validated,
      // so that no unchecked value can reach the cache path
      $corpus=$_GET['corpus'];
      // one cache folder per corpus (tolerate a configured dir without trailing slash)
      self::$cacheDir=rtrim(self::$cacheDir, '/\\').'/'.$corpus.'/';
      if (!file_exists(self::$cacheDir)) mkdir(self::$cacheDir, 0777, true);
      chmod (self::$cacheDir, 0777);
      // empty the session
      self::sessionClean();
      foreach(self::corpusFiles($corpus) as $file) $_SESSION['xml'][basename($file)]=$file;
    }
    // add upload file to session
    if (isset($_FILES['xml']['name'], $_FILES['xml']['tmp_name']) && is_string($_FILES['xml']['name']) && $_FILES['xml']['name']) {
      $name=$_FILES['xml']['name'];
      $file=tempnam(sys_get_temp_dir(), self::TMP_PREFIX);
      if ($file !== false && move_uploaded_file($_FILES['xml']['tmp_name'], $file)) {
        // replace a previous upload of the same name, but never delete a file
        // we did not create (the session may point to real corpus files)
        if (isset($_SESSION['xml'][$name]) && self::isTempFile($_SESSION['xml'][$name]) && file_exists($_SESSION['xml'][$name])) unlink($_SESSION['xml'][$name]);
        $_SESSION['xml'][$name]=$file;
        ksort($_SESSION['xml']);
      }
      // failed upload: do not leave the empty temp file behind
      else if ($file !== false && file_exists($file)) unlink($file);
    }
    // forget session entries whose file has vanished (e.g. temp dir cleaned up)
    foreach($_SESSION['xml'] as $name=>$file) {
      if (!is_string($file) || !is_file($file)) unset($_SESSION['xml'][$name]);
    }
    // Actions
    $xpath=self::param('xpath');
    if (isset($_REQUEST['value'])) $mode="value";
    else if (isset($_REQUEST['tokenize'])) $mode="tokenize";
    else if (isset($_REQUEST['locution'])) $mode="locution";
    else if (isset($_REQUEST['nodelist'])) $mode="nodelist";
    else $mode="tokenize";
    // display forms
    $format=self::param('format');
    if ($format!="csv" && $format!="txt") $format="html";
    if ($format=='html') {
      self::formFile();
      self::formDo();
    }
    // get a last modified date
    // (the modification time of the PHP sources themselves is not taken into account)
    foreach($_SESSION['xml'] as $name=>$file) {
      $mtime=filemtime($file);
      if ($mtime > self::$lastModified) self::$lastModified=$mtime;
    }
    // load a stop list
    $stop=self::param('stop');
    $exclude=null;
    if ($stop !== null && isset(self::$stoplist[$stop])) $exclude=self::loadStopList(self::$stoplist[$stop]);
    $stale=isset($_REQUEST['nocache']);
    // choose action
    if (isset($_REQUEST['help'])) {
      FormList::help();
      FormList::test();
      TagList::help();
      TagList::test();
    }
    // Taglist with cache for corpus
    else if ($corpus && isset($_REQUEST['taglist'])) {
      $cacheFile=self::$cacheDir."taglist.$format";
      $contents=null;
      if ($stale || !file_exists($cacheFile) || filemtime($cacheFile) < self::$lastModified) {
        $stats=new TagList($format);
        foreach(self::corpusFiles($corpus) as $file) {
          self::progress($format, $file);
          $stats->parse($file);
        }
        ob_start();
        $stats->table($format, $corpus);
        $contents=ob_get_clean();
        if (file_put_contents($cacheFile, $contents) !== false) @chmod($cacheFile, 0666);
      }
      // freshly generated: print it even if the cache could not be written;
      // otherwise replay the cache as plain data (never execute it as PHP)
      if ($contents !== null) echo $contents;
      else readfile($cacheFile);
    }
    // FormList with cache for known corpus
    else if ($corpus && $xpath) {
      $cacheFile=self::$cacheDir.urlencode($xpath.'_'.$mode.'_'.$stop).".$format";
      if ($stale || !file_exists($cacheFile) || filemtime($cacheFile) < self::$lastModified) {
        $stats=new FormList($mode, $exclude, $format);
        // caching is best effort: when the file cannot be opened, just print
        $cache=fopen($cacheFile, "w");
        if ($cache) @chmod($cacheFile, 0666);
        foreach(self::corpusFiles($corpus) as $file) {
          self::progress($format, $file);
          ob_start();
          $stats->parse($file, $xpath);
          $contents=ob_get_clean();
          if ($cache) fwrite($cache, $contents);
          echo $contents;
        }
        ob_start();
        $stats->table($format);
        $contents=ob_get_clean();
        if ($cache) {
          fwrite($cache, $contents);
          fclose($cache);
        }
        echo $contents;
      }
      else readfile($cacheFile);
    }
    // Taglist of the session files, no cache
    else if (isset($_REQUEST['taglist'])) {
      $stats=new TagList($format);
      foreach($_SESSION['xml'] as $name=>$file){
        self::progress($format, $file);
        $stats->parse($file);
      }
      // NOTE(review): table() prints by itself in the cached branches above,
      // the echo is kept from the original in case it also returns something
      echo $stats->table($format);
    }
    // FormList of the session files, no cache
    else if ($xpath) {
      $stats=new FormList($mode, $exclude, $format);
      foreach($_SESSION['xml'] as $name=>$file) {
        self::progress($format, $file);
        $stats->parse($file, $xpath);
      }
      $stats->table($format);
    }
    else {
      self::welcome();
    }
    // javascript
    if ($format=="html"){
      print '<script type="text/javascript" src="Sortable.js">//</script>';
    }
    else exit;
  }

  /**
   * Prints the two action forms: tag table, and XPath query with optional
   * stop list and output format. Request values are HTML-escaped before
   * being written back into the form.
   */
  static public function formDo() {
    $xpath=htmlspecialchars((string)self::param('xpath'), ENT_QUOTES, 'UTF-8');
    $corpus=htmlspecialchars((string)self::param('corpus'), ENT_QUOTES, 'UTF-8');
    echo '
<p/>
<form method="GET" name="taglist">
<input type="hidden" name="corpus" value="' , $corpus , '"/>
<input name="taglist" type="submit" value="Table des balises"/>
</form>
<form method="GET" name="xpath">
<input type="hidden" name="corpus" value="' , $corpus , '"/>
<label>Expression Xpath <input type="text" size="0" name="xpath" value="' , $xpath , '"/></label>';
    $stop=self::param('stop');
    if (count(self::$stoplist)) {
      echo '
<label> – Filtre
<select name="stop">
<option></option>
';
      foreach (self::$stoplist as $name=>$file) {
        echo "\n<option";
        if ($stop !== null && $stop === (string)$name) echo ' selected="selected"';
        echo ">" , htmlspecialchars((string)$name, ENT_QUOTES, 'UTF-8') , "</option>";
      }
      echo '
</select>
</label>';
    }
    print '
<label>Format
<select name="format">
<option/>
<option value="txt">Liste (txt)</option>
<option value="csv">Tableur (csv)</option>
</select>
</label>
<button type="submit" name="tokenize" value="1">Mots fréquents</button>
</form>
<p> </p>
';
    /* Options usually too heavy
    <button type="submit" name="nodelist" value="1" title="">Liste de contenus</button>
    <button type="submit" name="value" value="1" title="Table triée des valeurs">Table de valeurs</button>
    <button type="submit" name="tokenize" value="1">Table de mots</button>
    <button type="submit" name="locution" value="1">Table de locutions</button>
    */
  }

  /**
   * The form to manage filelists: prints the upload form, then either
   * empties the session (button "vider") or lists its files with their size.
   */
  static public function formFile() {
    print '
<form enctype="multipart/form-data" method="POST" name="upload" id="upload">
<label>Fichiers XML en session <input type="file" size="50" name="xml" id="file_xml" onchange="this.form.submit()"/></label>
<input name="ajouter" type="submit" value="Ajouter"/>
<input name="vider" type="submit" value="Vider"/>
</form>
';
    if(isset($_POST['vider'])) {
      self::sessionClean();
      print "<h2>Session vidée</h2>";
    }
    else {
      echo "<div>";
      if (isset($_SESSION['xml']) && is_array($_SESSION['xml'])) foreach($_SESSION['xml'] as $name=>$file) {
        $size=(is_string($file) && is_file($file)) ? filesize($file) : 0;
        // the name comes from the client (upload), escape it
        echo " – " , htmlspecialchars((string)$name, ENT_QUOTES, 'UTF-8') , " (" , self::sizeH($size) , "o)";
      }
      echo "</div>";
    }
  }

  /**
   * Empty the session of filelist. Temp files created for uploads are
   * deleted from disk, any other file is left alone.
   */
  static public function sessionClean() {
    if (isset($_SESSION['xml']) && is_array($_SESSION['xml'])) {
      foreach($_SESSION['xml'] as $file) {
        if (self::isTempFile($file) && file_exists($file)) unlink($file);
      }
    }
    $_SESSION['xml']=array();
  }

  /**
   * Placeholder, does nothing.
   */
  static public function action() {
  }

  /**
   * Welcome message, with one link per configured corpus.
   */
  static public function welcome() {
    print '
<h2>Pourquoi</h2>
<ul>
<li>2 Mo d’XML, cela fait un texte de combien de caractères ?</li>
<li>Est-ce que la balise &lt;toto&gt; est vraiment utilisée ?</li>
<li>Quelle est la part du texte cité (ex : &lt;quote&gt;) ?</li>
<li>Quels sont les mots les plus fréquents du corpus ?</li>
<li>Comment repérer les erreurs de termes dans un attribut ?</li>
</ul>
';
    if (count(self::$glob)) {
      print '<h2>Exemples</h2>
<ul>';
      foreach(self::$glob as $name => $file) {
        $label=htmlspecialchars((string)$name, ENT_QUOTES, 'UTF-8');
        $href=htmlspecialchars(urlencode((string)$name), ENT_QUOTES, 'UTF-8');
        print '<li><a href="?corpus='.$href.'&amp;taglist=">'.$label.'</a></li>';
      }
      print '</ul>';
    }
  }

  /**
   * Display nicer File size.
   *
   * @param int|float $size Size to display.
   * @param int       $mod  Base of the units (1024 by default).
   * @return string Number with 2 decimals (comma as decimal mark, space as
   *                thousands separator), a space, then the unit prefix
   *                ('', K, M, G, T or P), e.g. "1,50 K".
   */
  static public function sizeH ($size, $mod=1024) {
    $units=array('', 'K', 'M', 'G', 'T', 'P');
    $last=count($units) - 1;
    // ">=" so that exactly one unit is shown as "1,00 K"; stop at the largest known unit
    for ($i = 0; $size >= $mod && $i < $last; $i++) {
      $size /= $mod;
    }
    $num=number_format(round($size, 2), 2, ',', ' ');
    return $num . ' ' . $units[$i];
  }

  /**
   * Request parameter as a string, or null when absent or not a string
   * (e.g. "name[]=" array syntax).
   */
  static private function param($name) {
    if (!isset($_REQUEST[$name]) || !is_string($_REQUEST[$name])) return null;
    $value=$_REQUEST[$name];
    // undo magic quotes: they only exist before PHP 5.4, and the function
    // itself is removed in PHP 8, so never call it on recent versions
    if (PHP_VERSION_ID < 50400 && get_magic_quotes_gpc()) $value=stripslashes($value);
    return $value;
  }

  /**
   * Files of a configured corpus; always an array, even when glob() fails.
   */
  static private function corpusFiles($corpus) {
    $files=glob(self::$glob[$corpus]);
    return $files ? $files : array();
  }

  /**
   * True when the path looks like one of the temp files created for uploads.
   */
  static private function isTempFile($file) {
    return is_string($file) && strpos(basename($file), self::TMP_PREFIX) === 0;
  }

  /**
   * Reads a stop word file: one word per line, "#" starts a comment.
   * Returns the words as array keys, ready for isset() lookups.
   */
  static private function loadStopList($file) {
    $text=file_get_contents($file);
    if ($text === false) return array();
    // trim each line: CRLF endings or a space before a comment must not end up in the word
    $words=array_map('trim', explode("\n", preg_replace('/#.*/', '', $text)));
    $exclude=array_flip($words);
    unset($exclude['']);
    return $exclude;
  }

  /**
   * Sends a sign of life for each processed file, to avoid timeouts: an HTML
   * comment in html, a header in other formats (only while headers can still be sent).
   */
  static private function progress($format, $file) {
    if ($format == "html") echo "\n<!-- " , basename($file) , " -->";
    else if (!headers_sent()) header("XmlStats-File:".basename($file));
  }
}
// Generate special formats data
$format=(isset($_REQUEST['format']) && is_string($_REQUEST['format'])) ? $_REQUEST['format'] : null;
// specific format, generate and exit, break the caller
if ($format === "txt") {
  header("Content-type: text/plain; charset=UTF-8");
  xmlstats::body();
  exit;
}
if ($format === "csv") {
  header ("Content-Type: application/csv");
  $corpus=(isset($_REQUEST['corpus']) && is_string($_REQUEST['corpus']) && $_REQUEST['corpus']) ? $_REQUEST['corpus'] : 'xmlstats';
  // the name comes from the request: drop control characters, quotes and
  // path separators before putting it in a header, and quote the result
  $corpus=preg_replace('/[\x00-\x1F\x7F"\\\\\/]+/', '_', $corpus);
  if ($corpus === null || $corpus === '') $corpus='xmlstats';
  header('Content-Disposition: attachment; filename="'.$corpus.'.csv"');
  xmlstats::body();
  exit;
}
// included file, do nothing
if (isset($_SERVER['SCRIPT_FILENAME']) && realpath($_SERVER['SCRIPT_FILENAME']) != realpath(__FILE__)) {
}
else if (isset($_SERVER['ORIG_SCRIPT_FILENAME']) && realpath($_SERVER['ORIG_SCRIPT_FILENAME']) != realpath(__FILE__)) {
}
// direct command line call, work
else if (php_sapi_name() == "cli") {
  array_shift($_SERVER['argv']); // shift first arg, the script filepath
  // every remaining argument is a file or a glob pattern (the shell may
  // already have expanded *.xml into several arguments)
  $patterns=$_SERVER['argv'];
  if (!count($patterns)) {
    echo "Quel fichier ou dossier traiter ? Exemple : ../corpus/*.xml\n";
    $patterns=array(trim(fgets(STDIN)));
  }
  if ($patterns[0] == 'test') {
    // NOTE(review): the original called test() on a misspelled, nonexistent
    // class; the self-tests that do exist are the ones run by the "help" action
    FormList::test();
    TagList::test();
    exit;
  }
  echo "\nStatistiques XML pour ".implode(' ', $patterns);
  // NOTE(review): xmlstats has no parse()/report() methods, so the original
  // code could only end in a fatal error. The tag table in text format is
  // used instead, with the TagList calls seen in xmlstats::body() — confirm
  // this is the report wanted on the command line.
  $stats=new TagList("txt");
  foreach($patterns as $glob) {
    $files=glob($glob);
    if (!$files) continue; // no match, or glob() error
    foreach($files as $file) {
      echo "\n$file";
      $stats->parse($file);
    }
  }
  $stats->table("txt");
}
// direct http call
else {
?>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Statistiques XML</title>
<link rel="stylesheet" type="text/css" href="http://obvil.github.io/theme/obvil.css" />
</head>
<body>
<div id="center">
<header id="header">
<h1>
<a href="../">Développements</a>
</h1>
</header>
<div id="contenu">
<h1><a href=".">XML stats</a></h1>
<p class="byline">par Frédéric Glorieux</p>
<!-- XML STATS -->
<?php
xmlstats::body(); ?>
</div>
</div>
</body>
</html>
<?php
}
?>