-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.c
executable file
·759 lines (672 loc) · 22.7 KB
/
parser.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
/*Program to implement a POSIX based regex parser which performs the following operations
. Any single character. Example: h.t matches hat, hit, hot and hut.
[ ] Any one of the characters in the brackets, or any of a range of characters separated by a hyphen (-), or a character class operator
(see below). Examples: h[aeiou][a-z] matches hat, hip, hit, hop, and hut; [A-Za-z] matches any single letter; x[0-9] matches x0, x1,
…, x9.
[^] Any characters except for those after the caret "^". Example: h[^u]t matches hat, hit, and hot, but not hut.
^ The start of a line (column 1).
$ The end of a line (not the line break characters). Use this for restricting matches to characters at the end of a line. Example: end$
only matches "end" when it's the last word on a line, and ^end only matches "end" when it's the first word on a line.
\< The start of a word.
\> The end of a word.
\t The tab character.
\f The page break (form feed) character.
\n A new line character, for matching expressions that span line boundaries. This cannot be followed by operators '*', '+' or {}. Do not
use this for constraining matches to the end of a line. It's much more efficient to use "$".
* Matches zero or more of the preceding characters or expressions. Example: ho*p matches hp, hop and hoop.
? Matches zero or one of the preceding characters or expressions. Example: ho?p matches hp, and hop, but not hoop.
+ Matches one or more of the preceding characters or expressions. Example: ho+p matches hop, and hoop, but not hp.
{count} Matches the specified number of the preceding characters or expressions. Example: ho\{2\}p matches hoop, but not hop.
{min,} Matches at least the specified number of the preceding characters or expressions. Example: ho\{1,\}p matches hop and hoop, but
not hp.
{min,max} Matches between min and max of the preceding characters or expressions. Example: ho\{1,2\}p matches hop and hoop, but not hp or hooop.
\| Matches either the expression to its left or its right. Example: hop\|hoop matches hop, or hoop.
\ "Escapes" the special meaning of the above expressions, so that they can be matched as literal characters. Hence, to match a literal "
\", you must use "\\". Example: \< matches the start of a word, but \\< matches "\<".
Character Class Operators "[: ... :]":
These can be used in class expressions as an alternative way of representing classes of characters. For example, [a-z0-9] is equivalent to [[:lower:][:digit:]]. (Note the extra pairs of brackets.) The defined classes are:
Expression: Description:
[:alpha:] Any letter.
[:lower:] Any lower case letter.
[:upper:] Any upper case letter.
[:alnum:] Any digit or letter.
[:digit:] Any digit.
[:xdigit:] Any hexadecimal digit (0-9, a-f or A-F).
[:blank:] Space or tab.
[:space:] Space, tab, vertical tab, return, line feed, form feed.
*/
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#define SPACE 32
#define LINE 10
#define TAB 9
#define FORM 12
#define SLASH 92
#define MAX_REGEX_LENGTH 1024 + 400
#define MAX_STRING_LENGTH 2048 + 400
//The largest regex string would be 1024 and the largest compare string would be 2048.
/* match: search for regexp anywhere in text */
int extra_length = 0;
int match(char *regexp, char *text)
{
/*
The main part of the algorithm which keeps calling matchhere for every character of regex with the entered string
*/
char newtext[MAX_STRING_LENGTH + 1], newregex[MAX_REGEX_LENGTH + 1];
int index = 0;
if (regexp[0] == '^')
return matchhere(regexp+1, text);
if (regexp[0] == '/' && regexp[1] == '<')//Used to match the start of a word
{
newtext[0] = ' ';/* The first word in the text is prefixed with a space*/
strcpy(newtext + 1, text);
newregex[0] = ' ';/* The regex is also prefixed with a space so that it only matches words prefixed with a space(i.e start of words)*/
strcpy(newregex + 1, regexp + 2);
//printf("regex=%s\nstring=%s\n", newregex, newtext);
regexp = newregex;
text = newtext;
}
do { /* must look even if string is empty */
if (matchhere(regexp, text))
{
/* printf("\nPreprocessed regular expression %s", regexp);*/
/* printf("\nPart of the string matched -> ");*/
for (int iter=0; iter < strlen(text) - extra_length; iter ++)
printf("%c", *(text + iter));
/* printf("\n");*/
return 1;
}
index ++;
} while (*text++ != '\0');
return 0;
}
int getvaluelabel(char *c, int j, char *label, int mask)
{
/*
Used for [:expressions:]
Input: c containing the expression label
Result: c is replaced with the content denoted by the label
Output: returns the length of expanded c
*/
int k = 0;
if (!strcmp(label, "alpha"))
{
j = getvaluelabel(c, j, "lower", mask);
j = getvaluelabel(c, j, "upper", mask);
}
else if(!strcmp(label, "lower"))
{
if(!mask)
c[j] = 97;
else
c[j] = c[0];
k = 0;
while(k < 26)
if(!mask)
c[++j] = 97 + (++k);
else
c[++j] = c[0];
}
else if(!strcmp(label, "upper"))
{
if(!mask)
c[j] = 65;
else
c[j] = 0;
k = 0;
while(k < 26)
if(!mask)
c[++j] = 65 + (++k);
else
c[++j] = c[0];
}
else if(!strcmp(label, "alnum"))
{
j = getvaluelabel(c, j, "digit", mask);
j = getvaluelabel(c, j, "alpha", mask);
}
else if(!strcmp(label, "digit"))
{
if(!mask)
c[j] = 48;
else
c[j] = c[0];
k = 0;
while(k < 10)
if(!mask)
c[++j] = 48 + (++k);
else
c[++j] = c[0];
}
else if(!strcmp(label, "xdigit"))
{
j = getvaluelabel(c, j, "digit", mask);
if(!mask)
c[j] = 65;
else
c[j] = c[0];
k = 0;
while(k < 6)
if(!mask)
c[++j] = 65 + (++k);
else
c[++j] = c[0];
c[j] = 97;
k = 0;
while(k < 6)
if(!mask)
c[++j] = 97 + (++k);
else
c[++j] = c[0];
}
else if(!strcmp(label, "blank"))
{
if(!mask)
{
c[j] = SPACE;
c[++j] = TAB;
}
else
c[j] = c[++j] = c[0];
}
else if(!strcmp(label, "space"))
{
j = getvaluelabel(c, j, "blank", mask);
if(!mask)
c[++j] = FORM;
else
c[++j] = c[0];
}
else if(!strcmp(label, "cntrl"))
{
k = 0;
while( k < SPACE)
if(!mask)
c[j++] = k;
else
c[j++] = c[0];
}
return j;
}
/* matchhere: search for regexp at beginning of text */
int matchhere(char *regexp, char *text)
{
/*
A recursive function which recurses, consuming the text and matching everything with the regex
*/
char c[MAX_REGEX_LENGTH + 500], temp, label[8];
int i, j, k;
if (regexp[0] == '\0')
{
extra_length = strlen(text);
return 1;
}
if (regexp[0] == '[') //Dealing with square brackets
{
j = 0;
i = 1;
if(regexp[1] != '^')
{
while (regexp[i] != ']')
{
if (regexp[i] == ':')
{
k = 0;
i++; // skip the :
while( regexp[i] != ':')
label[k++] = regexp[i++];
label[k] = '\0';
i++;
/* printf("label=%s\n", label);*/
j = getvaluelabel(c, j, label, 0);
}
else if(regexp[i] == '[')
{
i += 2; //skip the : and [
k = 0;
while( regexp[i] != ':')
label[k++] = regexp[i++];
label[k] = '\0';
i+=2; //skip the : and ]
j = getvaluelabel(c, j, label, 0);
}
else if (regexp[i] == '-') //When we want a range
{
temp = regexp[i-1] + 1;
while(temp < regexp[i+1])
c[j++] = temp++;
i += 1;
}
else
c[j++] = regexp[i++];
}
c[j] = '\0';
}
else if(regexp[1] == '^') //Dealing with complements inside brackets
{
j = 0;
//First take all the ascii values into c, all these are allowed
while(j <= 255)
if ((char)j != '\0' && (char)j != '.')
c[j] = (char)j++;
else
c[j] = j++ - 1;
c[j] = '\0';
j = 0;
i = 2;
while (regexp[i] != ']')
{
//Replace some junk character(smiley) in place of those characters which are not allowed (as marking, don't consider)
if(regexp[i] == '[')
{
i += 2; //skip the :
k = 0;
while( regexp[i] != ':')
label[k++] = regexp[i++];
label[k] = '\0';
i+=2; //skip the : and ]
j = getvaluelabel(c, j, label, 1);
}
else if (regexp[i] == '-')
{
temp = regexp[i-1] + 1;
while(temp < regexp[i+1])
c[temp++] = c[0];
i += 1;
}
else
c[regexp[i++]] = c[0];
}
}
if (regexp[i + 1] == '*')
{
extra_length = strlen(text);
return matchstarbrackets(c, regexp + i + 2, text);
}
else if (regexp[i + 1] == '+')
{
extra_length = strlen(text);
return matchplusbrackets(c, regexp + i + 2, text);
}
else if (regexp[i + 1] == '?')
{
extra_length = strlen(text);
return matchquebrackets(c, regexp + i + 2, text);
}
else if (isin(text++, c))
{
extra_length = strlen(text);
return matchhere(regexp + i + 1, text);
}
else
return 0;
}
if (regexp[1] == '?')
{
extra_length = strlen(text);
return matchquemark(regexp[0], regexp+2, text);
}
if (regexp[1] == '*')
{
extra_length = strlen(text);
return matchstar(regexp[0], regexp+2, text);
}
if (regexp[1] == '+')
{
extra_length = strlen(text);
return matchplus(regexp[0], regexp+2, text);
}
if (regexp[0] == '$' && regexp[1] == '\0')
{
extra_length = strlen(text);
return *text == '\0';
}
if (regexp[0] == '/' && regexp[1] == '>' && regexp[2] == '\0')
{
extra_length = strlen(text);
return *text == ' ' || *text == '\0';
}
if (*text!='\0' && (regexp[0]=='.' || regexp[0]==*text))
{
extra_length = strlen(text + 1);
return matchhere(regexp+1, text+1);
}
return 0;
}
int matchquemark(int c, char *regexp, char *text)
{
if(matchhere(regexp, text))
return 1;
else if((*text++ == c || c == '.'))
return matchhere(regexp, text);
return 0;
}
int matchquebrackets(char *c, char *regexp, char *text)
{
if(matchhere(regexp, text))
return 1;
else if(isin(text++, c))
return matchhere(regexp, text);
return 0;
}
/* matchstar: search for c*regexp at beginning of text */
int matchstar(int c, char *regexp, char *text)
{
do { /* a * matches zero or more instances */
if (matchhere(regexp, text))
return 1;
} while (*text != '\0' && (*text++ == c || c == '.'));
return 0;
}
int isin( char *text, char *c)
{
int i = 0, l;
l = strlen(c);
while(i < l)
{
if (c[i] == '.')
return 1;
else if((int)*text == (int)c[i])
return 1;
i++;
}
return 0;
}
int matchstarbrackets(char *c, char *regexp, char *text)
{
do { /* a * matches zero or more instances */
if (matchhere(regexp, text))
return 1;
} while (*text != '\0' && (isin(text++, c)));
return 0;
}
int matchplusbrackets(char *c, char *regexp, char *text)
{
while (*text != '\0' && (isin(text++, c)))
{ /* a * matches zero or more instances */
if (matchhere(regexp, text))
return 1;
}
return 0;
}
/* matchstar: search for c*regexp at beginning of text */
int matchplus(int c, char *regexp, char *text)
{
while (*text != '\0' && (*text++ == c || c == '.'))
if (matchhere(regexp, text))
return 1;
return 0;
}
void preprocessfreq(char *reg, int length)
{
/*
To deal with {count}, {min, }, {min, max} operators we follow the logic as per the below example,
Assume the regex is abc
For abc\{2\} we transform regex to abcc
For abc\{2,\} we transform it to abcc+
For abc\{2,5\} we transform it to abccc?c?c?
The above logic is implemented as follows:
Implement the part for {min, max}
For count, do min = max and use the same logic as {min,max}
For {min}, do max = -1 and make it *
This function does the above transform
Input: The regex and its length
Result: The regex is transformed
*/
int i, j, k, x, ib, jb, iter, min, max, expand, level;
char temp[MAX_REGEX_LENGTH], num[10];
i = 0;
j = 0;
k = 0;
x = 0;
ib = 0;
jb = 0;
iter = 0;
while(i < length)
{
if(i < (length - 1) && reg[i] == SLASH)
{
if(reg[i + 1] == '{') // Found a pattern \{
{
//We have entered the part where
expand = i - 1;
i+=2; //skip slash and {
//Obtain the number in num
k = 0;
while(reg[i] != ',' && reg[i] != SLASH)
num[k++] = reg[i++];
num[k] = '\0';
if(reg[i] == SLASH) //This means we are dealing with count
min = max = atoi(num);
else
{
min = atoi(num);
if(reg[i + 1] != SLASH) //This means we are dealing with {min,max}
{
i++;//skip the comma
//Obtain the number in num
k = 0;
while(reg[i] != SLASH)
num[k++] = reg[i++];
num[k] = '\0';
max = atoi(num);
}
else //This means we are dealing with {min,}
{
i++;//skip the comma
max = -1;
}
}
i+=2; //skipping slash and }
//We are done finding the values of min and max, now we proceed to transform the regex as per the logic
if(reg[expand] != ']') //When the character before the {min, max} type function is not ']' => It is a single character
{
//first expand min no. of times
for(iter=0;iter<min-1;iter++)
temp[x++] = reg[expand];
//Expanding with question mark
for(;iter<max-1;iter++)
{
temp[x++] = reg[expand];
temp[x++] = '?';
}
//Expand with asterix for {min, }
if(max == -1)
temp[x++] = '+';
}
else
{
/*
When the character preceeding the {min, max} type function is a ']', we first find the opening brace corresponding
to it. Next we repeat the entire string from the '[' till ']' of that pattern
*/
level = 1;//This is used to keep track of level of '['
//Find the opening brace
ib = expand;
while(level != 0)
{
ib--;
if(reg[ib] == '[')
level--;
else if(reg[ib] == ']')
level++;
}
//now ib is pointing at [ corresponding to the ']'
//Expanding min times
for(iter=0;iter<min-1;iter++)
{
for(jb=ib;jb<=expand;jb++)
temp[x++] = reg[jb];
}
//Expanding with question mark for {min, max}
for(;iter<max-1;iter++)
{
for(jb=ib;jb<=expand;jb++)
temp[x++] = reg[jb];
temp[x++] = '?';
}
//Expand with asterix for {min, }
if(max == -1)
temp[x++] = '+';
}
}
}
else
temp[x++] = reg[i++]; //When that function is absent, we simply copy
}
temp[x] = '\0';
//Now we copy the entire message to regex
i = 0;
while(temp[i] != '\0')
reg[i] = temp[i++];
reg[i] = '\0';
}
int get_hex(char *str)
{
return (int)strtol(str, 0, 16);
}
void preprocess(char *str, int length, int flag)
{
/*
If the string contains characters \n,\t,\f,\\ they are replaced with their ASCII values -> Type-1
Occurences of \{, \<, \>, \} are retained as is. -> Type-2
Input: Unfiltered string, its length, flag
flag = 1 => the string is a regex, else it is a string
Result: The string is filtered
*/
int i, j, dec_flag, k;
char temp[MAX_STRING_LENGTH], num[10];
i = 0;
j = 0;
//We create the filtered regex in variable temp and later copy the value of temp to regex variable reg
while(i < length)
{
if(i < (length - 1) && str[i] == SLASH)
{
if(str[i+1] == 'n') //If it is a \n
{
temp[j++] = LINE;
i+=2;//skip \ and the character after it
}
else if(str[i+1] == 't') // If it is \t
{
temp[j++] = TAB;
i+=2;//skip \ and the character after it
}
else if(str[i+1] == 'f') // \f
{
temp[j++] = FORM;
i+=2;//skip \ and the character after it
}
else if((str[i+1] == 'd' || str[i+1] == 'x') && flag) //This preprocessing has to be done ony for regex
{
dec_flag = (str[i+1] == 'd'); //sets dec_flag to true if it is a decimal we are dealing with
i += 2; //Go to the character after 'x' or 'd'
//Now we store the character to obtain the ascii value
k = 0;
while(str[i] != SLASH)
num[k++] = str[i++];
num[k] = '\0';
i+=1;//There is a line doing i +=
//We convert the string to its equivalent ascii
if(dec_flag)
temp[j++] = (char)atoi(num);
else
temp[j++] = (char)get_hex(num);
}
else if(str[i+1] == SLASH && flag == 1) //When it is a regex, \\ has to be replaced with ASCII of slash
{
temp[j++] = SLASH;
i+=2;//skip \ and the character after it
}
else
{
//When it is of type-2
i--; // We decrement so that i+=2 does not skip the character after \ too
temp[j++] = SLASH;
i+=2;//skip \ and the character after it
}
}
else
temp[j++] = str[i++]; //Any other case just copy the regex
}
temp[j] = '\0';
i = 0;
//We replace reg with temp
while(temp[i] != '\0')
str[i] = temp[i++];
str[i] = '\0';
if(flag)//If its a regex, we need an extra step of preprocessing to be done
preprocessfreq(str, (int)strlen(str));
}
int buildarray(char regarray[][MAX_REGEX_LENGTH], char *str, int n)
{
/*
Divdes the regex string into groups depending on location of \|
Takes as input string str= reg1 \| reg2
Operation: makes array regarray= {reg1, reg2}
Returns output: No. of elements in regarray
*/
int i, j, k;
i = 0;
j = 0;
k = 0;
while(i < n)
{
if(i < (n - 1) && str[i] == SLASH && str[i + 1] == '|') // checking for \|
{
i += 2;
j++;
regarray[j][k] = '\0';
k = 0;
}
else
regarray[j][k++] = str[i++];
}
return (j + 1); // returning number of groups
}
int CompareString(char temp_reg[], char str[])
{
char regarray[50][MAX_REGEX_LENGTH];
int numgroups, i, j;
// printf("enter regex and string:\n");
// scanf("%s%s", temp_reg, str);
/*
To deal with \| operator, we input the regex string, check for the presence of \|, if present, it implies that we have to check
whether the string matches either of regex. We call the match function on each regex and do a boolean OR of the result.
Eg: reg1 \| reg2, forms an array such that regarray = {reg1, reg2}, numgroups=2 and match function is called on reg1 and if it
failes then on reg2. If one of them passes "found" is printed
*/
numgroups = buildarray(regarray, temp_reg, (int)strlen(temp_reg));//Gives the number of regexes to implement OR operation
i = 0;
/* Multiple regexes separated by or \|*/
while(i<numgroups)
{
//We do a bunch of preprocessing
preprocess(regarray[i], (int)strlen(regarray[i]), 1);
preprocess(str, (int)strlen(str), 0);
if(match(regarray[i], str))
{
return 1;
}
else
{
i++;
}
}
//printf("Not found\n");
return 0;
}
int main(int argc, char *argv[])
{
char strCompare[MAX_STRING_LENGTH], regexCompare[MAX_REGEX_LENGTH];
if (CompareString(argv[1], argv[2]) == 1)
{
return 0;//printf("\nMatch!\n");
}
else
{
return 1;//printf("\nNo match!\n");
}
}