-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathrdf-wilbur.lisp
222 lines (196 loc) · 7.94 KB
/
rdf-wilbur.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
(in-package :conllu.rdf)
(defun convert-features-to-rdf (features-string)
  "Input: string (the FEATS field of a token, e.g. \"Mood=Ind|Tense=Past\")
Output: list of feature nodes

Returns a list of nodes to be used as objects in triples with the
predicate \"conll:feats\".

FEATURES-STRING is split on `|' into Name=Value pairs.  Each pair
yields the node named \"conll:<name><Value>\", where <name> is Name with
its first character downcased.  The string \"_\" yields the empty list.
Nodes are accumulated with PUSH, so they are returned in the reverse of
the order in which the pairs appear in FEATURES-STRING.

A pair that does not split into exactly a name and a value on `='
signals an error from DESTRUCTURING-BIND.

Example:
  (let ((wilbur:*nodes* (make-instance 'wilbur:dictionary))
        (wilbur:*db* (make-instance 'wilbur:db)))
    (wilbur:add-namespace \"olia-sys\" \"http://purl.org/olia/system.owl#\")
    (wilbur:add-namespace \"conll\" \"http://br.ibm.com/conll/LEMMA\")
    (convert-features-to-rdf \"Mood=Ind|Tense=Past|VerbForm=Fin\"))
  =>
  (!\"http://br.ibm.com/conll/LEMMA#verbFormFin\"
   !\"http://br.ibm.com/conll/LEMMA#tensePast\"
   !\"http://br.ibm.com/conll/LEMMA#moodInd\")"
  (if (equal features-string "_")
      '()
      (let ((node-list nil))
        ;; :REMOVE-EMPTY-SUBSEQS is a boolean flag; pass T explicitly
        ;; rather than relying on the input string being non-NIL.
        (dolist (feat-pair (split-sequence #\| features-string
                                           :remove-empty-subseqs t)
                           node-list)
          ;; :COUNT 2 keeps only the first two `='-separated pieces.
          (destructuring-bind (name value)
              (split-sequence #\= feat-pair :count 2)
            (push (node (format nil "conll:~a~a"
                                ;; Downcase only the first character of NAME.
                                (concatenate 'string
                                             (string-downcase (subseq name 0 1))
                                             (subseq name 1))
                                value))
                  node-list))))))
(defun convert-token-to-rdf (token sentence-id sentence-node)
  "Input: TOKEN (object with the cl-conllu token slots), SENTENCE-ID, SENTENCE-NODE
Output: list of triples

Builds the triples describing TOKEN.  The token's node is named
\"NAMESPACE:<SENTENCE-ID>-<token id>\".  The result starts with an
rdf:type nif:Word triple and a conll:inSentence triple pointing at
SENTENCE-NODE, followed by one triple per field value with predicate
\"conll:<FIELD>\" (field name upcased).  Every field contributes a single
literal except FEATS, whose objects come from CONVERT-FEATURES-TO-RDF."
  (flet ((literal-list (slot-name)
           ;; One-element list holding the literal for the given slot.
           (list (wilbur:literal (slot-value token slot-name)))))
    (let* ((word-node (node (format nil "NAMESPACE:~a-~a"
                                    sentence-id
                                    (slot-value token 'cl-conllu::id))))
           (field-names '(id form lemma upostag xpostag feats head deprel deps))
           ;; Property list: field name -> list of object nodes/literals.
           (field-objects
             (list 'id (literal-list 'cl-conllu::id)
                   'form (literal-list 'cl-conllu::form)
                   'lemma (literal-list 'cl-conllu::lemma)
                   'upostag (literal-list 'cl-conllu::upostag)
                   'xpostag (literal-list 'cl-conllu::xpostag)
                   'feats (convert-features-to-rdf
                           (slot-value token 'cl-conllu::feats))
                   'head (literal-list 'cl-conllu::head)
                   'deprel (literal-list 'cl-conllu::deprel)
                   'deps (literal-list 'cl-conllu::deps)))
           (type-triple (wilbur:triple word-node
                                       (node "rdf:type")
                                       (node "nif:Word")))
           (membership-triple (wilbur:triple word-node
                                             (node "conll:inSentence")
                                             sentence-node)))
      (list* type-triple
             membership-triple
             (loop for field in field-names
                   append (loop for object in (getf field-objects field)
                                collect (wilbur:triple
                                         word-node
                                         (node (format nil "conll:~a"
                                                       (string-upcase field)))
                                         object)))))))
(defun convert-sentence-metadata (metadata sentence-node)
  "Input: list of dotted pairs (name . value), node
Output: flat list of triples

Builds one triple per element of METADATA, with SENTENCE-NODE as subject:
  - when the car of the pair is a string, the predicate is
    \"conll:metadata/<car>\" and the object is a literal of the cdr;
  - otherwise, when the cdr is :NONE, the predicate is \"conll:metadata\"
    and the object is a literal of the cdr;
  - any other pair signals a continuable error.  Choosing the
    \"Ignore undetermined case.\" restart skips that pair, so the result
    never contains NIL.

Example:
  (let ((wilbur:*db* (make-instance 'wilbur:db))
        (metadata '((\"sent_id\" . \"test\")
                    (\"text\" . \"The US troops fired into the hostile crowd, killing 4.\")))
        (sentence-node (node \"sentence\")))
    (convert-sentence-metadata metadata sentence-node))
  =>
  (#<WILBUR:TRIPLE !\"sentence\" !conll:metadata/sent_id #\"test\" {10048AA2E3}>
   #<WILBUR:TRIPLE !\"sentence\" !conll:metadata/text #\"The US troops fired into the hostile crowd, killing 4.\" {10048AA753}>)"
  (loop for pair in metadata
        for triple = (cond
                       ((stringp (car pair))
                        (wilbur:triple
                         sentence-node
                         (node (format nil "conll:metadata/~a" (car pair)))
                         (wilbur:literal (cdr pair))))
                       ;; NOTE(review): this branch is reached only when the
                       ;; car is not a string, and it stores the keyword :NONE
                       ;; itself as the literal.  Confirm against the producer
                       ;; of METADATA whether the :NONE test should come first
                       ;; and use (car pair) as the literal instead.
                       ((equal (cdr pair) :none)
                        (wilbur:triple
                         sentence-node
                         (node "conll:metadata")
                         (wilbur:literal (cdr pair))))
                       (t
                        ;; CERROR returns NIL when continued; the WHEN clause
                        ;; below keeps that NIL out of the triple list.
                        (cerror "Ignore undetermined case."
                                "Indetermined metadata case: ~a" pair)))
        when triple
          collect triple))
(defun convert-sentence-to-rdf (sentence text sentence-id corpus-id-node)
  "Input: SENTENCE, its TEXT, its SENTENCE-ID and the node of its corpus
Output: list of triples

The sentence's node is named \"NAMESPACE:<SENTENCE-ID>\".  The result
holds, in order: the corpus -> sentence link (conll:sentences), the
sentence -> corpus link (conll:corpus), the rdf:type conll:Sentence
triple, the rdfs:label triple carrying TEXT, the triples produced by
CONVERT-SENTENCE-METADATA for (sentence-meta SENTENCE), and the triples
produced by CONVERT-TOKEN-TO-RDF for each of (sentence-tokens SENTENCE)."
  (let* ((this-sentence (node (format nil "NAMESPACE:~a" sentence-id)))
         (header-triples
           (list (wilbur:triple corpus-id-node
                                (node "conll:sentences")
                                this-sentence)
                 (wilbur:triple this-sentence
                                (node "conll:corpus")
                                corpus-id-node)
                 (wilbur:triple this-sentence
                                (node "rdf:type")
                                (node "conll:Sentence"))
                 (wilbur:triple this-sentence
                                (node "rdfs:label")
                                (wilbur:literal text))))
         ;; Sentence metavalues.
         (metadata-triples
           (convert-sentence-metadata (sentence-meta sentence)
                                      this-sentence))
         (token-triples
           (loop for token in (sentence-tokens sentence)
                 append (convert-token-to-rdf token
                                              sentence-id
                                              this-sentence))))
    (append header-triples metadata-triples token-triples)))
(defun convert-to-rdf (sentences &key (text-fn #'sentence-text) (id-fn #'sentence-id) (corpusname "my-corpus")
                                   (namespace-string "http://www.example.org/") (stream *standard-output*)
                                   (rdf-format :ntriples) (conll-namespace "http://br.ibm.com/conll/"))
  "Converts SENTENCES, a list of sentences (e.g. as generated by
READ-CONLLU), to RDF and writes the serialization to STREAM.

Keyword arguments:
  TEXT-FN          function called on each sentence to obtain its text.
  ID-FN            function called on each sentence to obtain its id
                   (there is no standardized way of knowing this).
  CORPUSNAME       used as the rdfs:label literal of the corpus node.
  NAMESPACE-STRING URI bound to the local \"NAMESPACE\" prefix.
  STREAM           destination of the serialization.
  RDF-FORMAT       only :NTRIPLES is supported; anything else signals
                   an error from ECASE.
  CONLL-NAMESPACE  URI bound to the \"conll\" prefix.

The conversion runs with fresh bindings of WILBUR:*NODES* and
WILBUR:*DB*.  The corpus node is named \"NAMESPACE:c<uuid>\" using a
freshly generated v4 UUID."
  (let* ((wilbur:*nodes* (make-instance 'wilbur:dictionary))
         (wilbur:*db* (make-instance 'wilbur:db))
         ;; (prefix uri) entries; "NAMESPACE" is the prefix standing for
         ;; the local namespace.
         (prefix-table
           (list (list "conll" conll-namespace)
                 '("rdf" "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
                 '("rdfs" "http://www.w3.org/2000/01/rdf-schema#")
                 '("dc" "http://purl.org/dc/elements/1.1/")
                 '("dcterms" "http://purl.org/dc/terms/")
                 '("skos" "http://www.w3.org/2004/02/skos/core#")
                 '("owl" "http://www.w3.org/2002/07/owl#")
                 '("nif" "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
                 '("terms" "http://purl.org/acoli/open-ie/")
                 '("olia-sys" "http://purl.org/olia/system.owl#")
                 (list "NAMESPACE" namespace-string)))
         (corpus-uuid (uuid:make-v4-uuid))
         (corpus-node (wilbur::unresolved-node
                       (format nil "NAMESPACE:c~a" corpus-uuid))))
    ;; Register the namespaces.
    (loop for (prefix uri) in prefix-table
          do (wilbur:add-namespace prefix uri))
    ;; Build every triple first, then insert them into the database.
    (let* ((corpus-triples
             (list (wilbur:triple corpus-node
                                  (node "rdf:type")
                                  (node "conll:Corpus"))
                   (wilbur:triple corpus-node
                                  (node "rdfs:label")
                                  (wilbur:literal corpusname))))
           (sentence-triples
             (loop for sentence in sentences
                   append (convert-sentence-to-rdf
                           sentence
                           (funcall text-fn sentence)
                           (funcall id-fn sentence)
                           corpus-node))))
      (dolist (triple corpus-triples)
        (wilbur:add-triple triple))
      (dolist (triple sentence-triples)
        (wilbur:add-triple triple)))
    ;; Serialize.  The same data can't be serialized by wilbur in both
    ;; ntriples and rdf/xml, so only ntriples is offered.  An rdf/xml
    ;; branch would call WILBUR::DUMP-AS-RDF/XML with the reversed
    ;; triples, STREAM and (wilbur:namespaces).
    (ecase rdf-format
      (:ntriples
       (wilbur::dump-as-ntriples
        (reverse (wilbur:db-triples wilbur:*db*))
        stream)))))
(defun node (arg)
"Input: ARG
Output: (node)
Returns the result of calling WILBUR::UNRESOLVED-NODE on ARG; this
function only delegates to it.
The commented-out body below appears to be an earlier implementation,
which expanded string arguments with the namespaces contained in the
dynamic variable WILBUR:*NODES*."
;; Disabled implementation, kept for reference:
;; (typecase arg
;; (string
;; (wilbur:node
;; (wilbur:expand-name-with-namespace
;; arg
;; (wilbur:dictionary-namespaces wilbur:*nodes*))))
;; (number
;; (wilbur:node (format nil "~a" arg)))
;; (t
;; (wilbur:node arg))))
(wilbur::unresolved-node arg))
;; ==============
;; Some useful information/snippets about wilbur:
;;
;; (setq *readtable* (copy-readtable nil)) ;; wilbur changes the
;; readtable; this reverts it (e.g. if you are having problems with
;; the exclamation mark, `!').
;; wilbur:add-namespace prefix uri
;; (setf wilbur:*db* (make-instance 'wilbur:db))
;; (wilbur:triple (wilbur:node ...) (wilbur:node ...) (wilbur:node ...))
;; (wilbur:expand-name-with-namespace NAME-STRING (wilbur:dictionary-namespaces *nodes*))
;; (setf wilbur:*db* (make-instance 'wilbur:db))
;; Wilbur uses important dynamic variables:
;; *nodes* is a special variable of class DICTIONARY that contains namespaces
;; *db* is a special variable corresponding to the current database