-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathwikipatterns.properties
95 lines (90 loc) · 6.78 KB
/
wikipatterns.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#-----------------------------------------------------------
# T A G M E W I K I P A T T E R N S
#-----------------------------------------------------------
#
#A file containing some useful patterns to identify special pages and categories from Wikipedia.
#All values are interpreted as standard Java regular expressions and lower case is applied,
#unless a different behaviour is specified. Charset must be UTF-8.
#Be careful: values are trimmed, so if you want to include a blank char at the beginning
#of the value, you have to escape it (\u0020).
#Moreover, you have to escape backslashes by doubling them (\\).
#See http://java.sun.com/javase/6/docs/api/java/util/Properties.html#load(java.io.Reader)
#for further details.
#
#-------------------
# K E Y S
#-------------------
#<lang>.disamb_cat
# identifies categories that link every disambiguation page.
# Only direct links are caught; no category graph visit is performed.
# Match is performed against the category title with lowercasing
#
#<lang>.disamb_suffix
# identifies pages that have a disambiguation suffix in their title.
# Match is performed against the whole title of the page, but lowercase is not applied
#
#<lang>.list_cat
# identifies categories that link every list page. This is often a heuristic match...
# Only direct links are caught; no category graph visit is performed.
# Match is performed against the category title with lowercasing
#
#<lang>.people_cat
# identifies categories that link every page about a person. This is often a heuristic match...
# Only direct links are caught; no category graph visit is performed.
# Match is performed against the category title with lowercasing
#
#<lang>.anchor_start
# identifies a pattern at the beginning of each anchor that is ignored and removed from the anchor text.
# Match is performed against the anchor that is lowercased and ASCII-normalized
#
#<lang>.anchor_stopwords
# identifies a comma-separated list of words that have to be removed from an anchor.
# Match is performed against the anchor after it has been lowercased, ASCII-normalized and stripped of punctuation
#
#<lang>.page_date
# identifies a pattern of all possible calendar date expressions, for discarding those pages that are dates
# Match is performed against the whole title, non-lowercased
#
#<lang>.page_ignore
# identifies a pattern of pages that have to be ignored
# Match is performed against the whole title, non-lowercased
#-------------------
# German (de)
#-------------------
# Categories marking disambiguation pages, including given-name and family-name categories.
de.disamb_cat=(begriffskl\u00E4rung)|(m\u00E4nnlicher vorname)|(weiblicher vorname)|(familienname)
# Title suffix " (Begriffsklärung)"; the leading blank is escaped as \u0020.
de.disamb_suffix=\u0020\\([Bb]egriffskl\u00E4rung\\)$
de.list_cat=^((liste .*)|(.* liste))$
# Birth/death year categories plus the gender categories.
de.people_cat=^((geboren \\d{1,4})|(gestorben \\d{1,4})|(person nach geburtsjahrhundert)|(person nach todesjahrhundert)|(frau)|(mann)|(geschlecht unbekannt)|(intersexueller))$
# Leading definite/indefinite articles stripped from the beginning of an anchor.
de.anchor_start=^(das |dem |den |der |des |die |ein |eine |einem |einen |einer |eines )
# Comma-separated stopwords. "eines" and "als" are two separate entries: they were
# previously fused into the single token "einesals" because of a missing comma,
# so neither word was listed as a stopword.
de.anchor_stopwords=das,dem,den,der,des,die,denn,keine,keinen,keiner,beim,ich,ein,eine,einem,einen,einer,eines,als,am,an,auf,aus,bei,bis,durch,fuer,gen,im,in,indem,mit,mittels,nach,neben,per,pro,seit,uber,um,und,unter,von,zu,zum,viele,finden,daher,darf,doch,ihre,sich,allem,meist,ware,kenne,gut,guten,gute,gutes,er,nicht,will,euch,schlecht,mein,schon,tun,gemacht,oder,hier,mich,dich
# Years (optionally "v. Chr.") and month names with an optional year.
de.page_date=^((\\d{1,4})( v[.] Chr[.])?|([Jj]anuar|[Ff]ebruar|[Mm]\u00E4rz|[Aa]pril|[Mm]ai|[Jj]uni|[Jj]uli|[Aa]ugust|[Ss]eptember|[Oo]ktober|[Nn]ovember|[Dd]ezember)( \\d{1,4})?( v[.] Chr[.])?)$
# Pages whose title is a spelled-out small number ("Minus eins" .. "Elf").
de.page_ignore=^(([Mm]inus eins)|(Null)|(Eins)|(Zwei)|(Drei)|(Vier)|(F\u00FCnf)|(Sechs)|(Sieben)|(Acht)|(Neun)|(Zehn)|(Elf))$
#-------------------
# English (en)
#-------------------
# Categories marking disambiguation pages and set index articles.
en.disamb_cat=(all article disambiguation pages)|(all set index articles)
# Title suffix " (disambiguation)"; the leading blank is escaped as \u0020.
en.disamb_suffix=\u0020\\([Dd]isambiguation\\)$
en.list_cat=^((lists .*)|(.* lists))$
# "<year> births", "<year> deaths" and "living people" categories.
en.people_cat=^((\\d{1,4} births)|(\\d{1,4} deaths)|(living people))$
# Leading articles stripped from the beginning of an anchor.
en.anchor_start=^(a |an |the )
en.anchor_stopwords=the,a,an,by,for,from,and,or,with,on,to,in
# Years (optionally "BC") and month names with an optional year.
# The last month is spelled "[Dd]ecember"; the former "[Dd][ie]cember" also
# accepted the misspelling "Dicember".
en.page_date=^((\\d{1,4})( BC)?|([Jj]anuary|[Ff]ebruary|[Mm]arch|[Aa]pril|[Mm]ay|[Jj]une|[Jj]uly|[Aa]ugust|[Ss]eptember|[Oo]ctober|[Nn]ovember|[Dd]ecember)( \\d{1,4}( BC)?)?)$
# Pages titled "<n> (number)", with an optional ASCII or Unicode (\u2212) minus sign.
en.page_ignore=^([-\u2212]?\\d+ \\([Nn]umber\\))$
#-------------------
# Spanish (es)
#-------------------
# disamb_cat is matched against the lowercased category title (see the KEYS
# section above), so the pattern itself must be all lower case: the former
# "Wikipedia:..." with a capital W could never match a lowercased title.
es.disamb_cat=(wikipedia:desambiguaci\u00F3n)
# Title suffix " (desambiguación)"; the leading blank is escaped as \u0020.
es.disamb_suffix=\u0020\\([Dd]esambiguaci\u00F3n\\)$
es.list_cat=^((listas? de .*)|(anexo:.*))$
es.people_cat=^((hombres)|(mujeres)|(nacidos en .*)|(fallecidos en .*))$
# Short leading words stripped from the beginning of an anchor.
es.anchor_start=^(a |e |o |y |al |de |el |en |es |ha |he |la |le |lo |me |mi |ni |no |os |se |si |su |te |ti |tu |un |ya |yo |con |del )
es.anchor_stopwords=a,e,o,y,al,de,el,en,es,ha,he,la,le,lo,me,mi,ni,no,os,se,si,su,te,ti,tu,un,ya,yo,con,del,era,esa,ese,eso,fue,fui,han,has,hay,las,les,los,mas,mia,mio,mis,muy,nos,por,que,sea,sin,son,soy,sus,tus,una,uno,algo,ante,como,cual,ella,eran,eras,eres,esas,esos,esta,este,esto,haya,hube,hubo,mias,mios,nada,otra,otro,para,pero,poco,sean,seas,sera,sere,sido,sois,suya,suyo,todo,tuve,tuvo,tuya,tuyo,unos
# "[<day> de ]<month>[ <year>[ a. C.]]", "Año(s) <year>[ a. C.]" and "Año cero".
es.page_date=^((([1-3]?\\d de )?([Ee]nero|[Ff]ebrero|[Mm]arzo|[Aa]bril|[Mm]ayo|[Jj]unio|[Jj]ulio|[Aa]gosto|[Ss]eptiembre|[Oo]ctubre|[Nn]oviembre|[Dd]iciembre)( ?\\d{1,4}( a[.] C[.])?)?)|(A\u00F1os? )(\\d{1,4}( a[.] C[.])?)|(A\u00F1o cero))$
# Pages titled "<n> (número)" and pages whose title is a spelled-out number.
es.page_ignore=^(([-\u2212]?\\d+ \\([Nn]\u00FAmero\\))|(Menos uno)|(Cero)|(Uno)|(Dos)|(Tres)|(Cuatro)|(Cinco)|(Seis)|(Siete)|(Ocho)|(Nueve)|(Diez)|(Once)|(Doce)|(Trece)|(Catorce)|(Quince)|(Diecis\u00E9is)|(Diecisiete)|(Dieciocho)|(Diecinueve)|(Veinte)|(Veinti[^ ]*)|(Treinta)|(Cien)|(Mil)|(Mill\u00F3n)|(Millardo))$
#-------------------
# French (fr)
#-------------------
fr.disamb_cat=(homonymie)
# Title suffix " (homonymie)"; the leading blank is escaped as \u0020.
fr.disamb_suffix=\u0020\\([Hh]omonymie\\)$
fr.list_cat=^(listes? (de |des |du |d').*)$
fr.people_cat=^((d\u00e9c\u00E8s (en|au|\u00E0) .*)|(naissance en .*))$
# Leading French articles stripped from the beginning of an anchor.
# This entry previously held the English articles "a |an |the ", apparently
# copied from en.anchor_start, so French articles were never removed.
# NOTE(review): confirm the article list against the anchor normalization
# (the elided form "l'" assumes apostrophes survive it, as in it.anchor_start).
fr.anchor_start=^(le |la |les |l'|un |une |des )
fr.anchor_stopwords=a,au,avec,bien,dans,de,des,dois,dont,elle,elles,en,entre,et,il,ils,je,l,la,le,notamment,nous,par,qu,que,qui,sont,sur,tout,tres,tu,un,vous
# "[<day> ]<month>[ <year>][ av. J.-C.]" and "[Année(s) |An ]<year>[ av. J.-C.]".
fr.page_date=^((((1er|[1-3]?\\d) )?([Jj]anvier|[Ff]\u00E9vrier|[Mm]ars|[Aa]vril|[Mm]ai|[Jj]uin|[Jj]uillet|[Aa]o\u00FBt|[Ss]eptembre|[Oo]ctobre|[Nn]ovembre|[Dd]\u00E9cembre)( \\d{1,4})?( av[.] J[.]-C[.])?)|((Ann\u00E9es? |An )?([1-9][0-9]*)( av[.] J[.]-C[.])?))$
# Pages titled "<n> (nombre)" and the page "Zéro".
fr.page_ignore=^(([-\u2212]?\\d+ \\([Nn]ombre\\))|(Z\u00e9ro))$
#-------------------
# Italian (it)
#-------------------
# Unlike the other languages in this file, this category pattern is anchored with ^...$.
it.disamb_cat=^((disambigua)|(combinazioni di 2 caratteri)|(combinazioni di 3 caratteri))$
# Title suffix " (disambigua)"; the leading blank is escaped as \u0020.
it.disamb_suffix=\u0020\\([Dd]isambigua\\)$
it.list_cat=^(liste .*)$
# "nati nel <year>", "morti nel <year>" and "persone viventi" categories.
it.people_cat=^((nati nel \\d{1,4})|(morti nel \\d{1,4})|(persone viventi))$
# Leading articles stripped from the beginning of an anchor, including the elided forms l' and un'.
it.anchor_start=^(il |la |lo |gli |le |l'|un |un'|una |i |uno )
it.anchor_stopwords=il,la,lo,gli,le,l,un,una,uno,del,dell,della,degli,dello,delle,di,a,da,in,con,su,per,tra,fra,col,e,ed,o,i,al,alla,allo,agli,alle,dal,dalle,dallo,dagli,dalla,tutto,tutta,tutti,tutte,molto,molte,devo,deve,devi
# A bare year, or a month name optionally preceded by a day number (with an optional \u00BA or \u00B0 ordinal mark).
it.page_date=^((\\d{1,4})|([1-3]?\\d[\u00BA\u00B0]? )?([Gg]ennaio|[Ff]ebbraio|[Mm]arzo|[Aa]prile|[Mm]aggio|[Gg]iugno|[Ll]uglio|[Aa]gosto|[Ss]ettembre|[Oo]ttobre|[Nn]ovembre|[Dd]icembre))$
# Pages titled "<n> (numero)", with an optional ASCII or Unicode (\u2212) minus sign.
it.page_ignore=^[-\u2212]?\\d+ \\([Nn]umero\\)$