From 282b16a811e3c86de95923bc3a420f7beae53f53 Mon Sep 17 00:00:00 2001 From: Simon Gray Date: Mon, 27 Nov 2023 14:54:18 +0100 Subject: [PATCH] 2023-11-28 release --- pages/releases-da.md | 6 +- pages/releases-en.md | 6 +- src/main/dk/cst/dannet/db/bootstrap.clj | 66 ++++++++----------- src/main/dk/cst/dannet/db/search.clj | 1 - src/main/dk/cst/dannet/query.clj | 8 --- src/main/dk/cst/dannet/query/operation.clj | 5 +- .../dk/cst/dannet/query/operation/llm.clj | 1 + src/main/dk/cst/dannet/web/components.cljc | 15 ++--- src/main/dk/cst/dannet/web/resources.clj | 9 +-- 9 files changed, 46 insertions(+), 71 deletions(-) diff --git a/pages/releases-da.md b/pages/releases-da.md index 296f39d..97eff15 100644 --- a/pages/releases-da.md +++ b/pages/releases-da.md @@ -1,8 +1,10 @@ # Versioner De nye DanNet-versioner bruger udgivelsesdatoen som versionsnummer, formatteret som `YYYY-MM-DD`. -## **SNAPSHOT**: Ordfrekvens -* Ordfrekvens fra [DDO](https://ordnet.dk/ddo) (delt af DSL) er blevet tilføjet til DanNet-datasættet. +## **2023-11-28**: Korte etiketter +* `dns:shortLabel`-varianter af de eksisterende synset-labels (udledt fra bl.a. ordfrekvenser fra [DDO](https://ordnet.dk/ddo)) er blevet tilføjet til DanNet-datasættet. +* `dns:source` bruges nu igen til at linke til oprindelige opslagskilder som f.eks. DDO. Brugen af `dc:source` var både problematisk ift. skemadefinitionen samt det irritationsmoment at `dc` i visse tilfælde skaber forvirring når det bruges som RDF-præfiks, da det kan være hardcoded til en bestemt IRI. +* Nogle sense-etiketter havde ved en fejl mistet deres sprog (@da) og dette er nu udbedret. ## **2023-09-28**: Rettelse af domain topic-relationen * `wn:has_domain_topic`-relationen har været brugt i stedet for `wn:domain_topic` i DanNet-datasættet. Dette er nu blevet rettet. 
diff --git a/pages/releases-en.md b/pages/releases-en.md index 4beaa71..daef4bf 100644 --- a/pages/releases-en.md +++ b/pages/releases-en.md @@ -1,8 +1,10 @@ # Releases The newer DanNet releases use the release date as the version number, formatted as `YYYY-MM-DD`. -## **SNAPSHOT**: Word frequencies -* Word frequencies from [DDO](https://ordnet.dk/ddo) (provided by DSL) have been added to the DanNet dataset. +## **2023-11-28**: Short labels +* `dns:shortLabel` variants of synset labels (derived from, amongst other things, word frequencies from [DDO](https://ordnet.dk/ddo)) have been added to the DanNet dataset. +* `dns:source` is now used once again to link to the original dictionary entry sources such as DDO. The usage of `dc:source` was problematic both wrt. its definition in the schema and due to the annoying fact that `dc` in some cases results in confusion when used as an RDF prefix, as it may be hardcoded to a specific IRI. +* Some sense labels had lost their language (@da) by mistake and this has now been fixed. ## **2023-09-28**: Fixing the domain topic relation * The `wn:has_domain_topic` relation had been used in place of `wn:domain_topic` in the DanNet dataset. This has now been corrected. diff --git a/src/main/dk/cst/dannet/db/bootstrap.clj b/src/main/dk/cst/dannet/db/bootstrap.clj index f6e141c..612fe5c 100644 --- a/src/main/dk/cst/dannet/db/bootstrap.clj +++ b/src/main/dk/cst/dannet/db/bootstrap.clj @@ -98,7 +98,7 @@ "2023-09-28") (def current-release - (str old-release "-SNAPSHOT")) + (str "2023-11-28" #_"-SNAPSHOT")) (defn assert-expected-dannet-release! "Assert that the DanNet `model` is the expected release to boostrap from." 
@@ -268,38 +268,29 @@ (map row->triples) (doall)))) -;; TODO: remove -(defn ->freq-triples +(defn lemma-freq [[ddo_entryid _ ddo_artikeltyngde :as row]] (when (not-empty ddo_artikeltyngde) (let [word (shared/word-uri ddo_entryid) value (Integer/parseUnsignedInt ddo_artikeltyngde)] - #{[word :dns/ddoFrequency value]}))) + [word value]))) -;; TODO: remove -(defn add-word-frequency! - "Add word frequency data from DDO; useful for ranking/selecting labels." - [dataset] - (let [graph (db/get-graph dataset prefix/dn-uri) - model (db/get-model dataset prefix/dn-uri) - unlabeled (op/sparql "SELECT ?w - WHERE { - ?w dns:ddoFrequency ?f . - FILTER NOT EXISTS { - ?w rdfs:label ?l . - } - }") - triples (->> (read-triples [->freq-triples "bootstrap/other/dannet-new/artikeltyngde_bet 3.csv" - :preprocess rest]) - (apply set/union))] - (txn/transact-exec graph - (println "... adding" (count triples) "dns:ddoFreq triples") - (db/safe-add! graph triples)) - (let [unlabeled (map '?w (q/run graph unlabeled))] - (txn/transact-exec model - (println "... removing" (count unlabeled) "unlabeled word triples") - (doseq [word unlabeled] - (db/remove! model [word :dns/ddoFrequency '_])))))) +(defn mk-word->frequency + [] + (->> (read-triples [lemma-freq "bootstrap/other/dannet-new/artikeltyngde_bet 3.csv" + :preprocess rest]) + (into {}))) + +(defn mk-sense-label->word + [g] + (->> (q/run-basic g op/short-label-candidates) + (map (juxt '?label '?word)) + (remove (comp nil? second)) + (into {}))) + +(defn mk-sense-label->frequency + [g] + (comp (mk-word->frequency) (mk-sense-label->word g))) (defn fix-source-relations! "Use a custom source relation that is less strict than dc:source and less @@ -347,14 +338,6 @@ (println "... adding" (count triples) "fixed sense labels") (db/safe-add! graph triples)))) -(defn mk-sense-label->freq - "Create the sense-label->freq mapping based on the data in Graph `g`." 
- [g] - (->> (q/run-basic g op/short-label-candidates) - (map (juxt '?label '?freq)) - (remove (comp nil? second)) - (into {}))) - (defn abridged-synset-label [sense-label->freq synset-label] (let [labels (shared/sense-labels shared/synset-sep (str synset-label))] @@ -367,7 +350,7 @@ [dataset] (println "... locating frequency data and synset labels") (let [graph (db/get-graph dataset prefix/dn-uri) - sense-label->freq (mk-sense-label->freq graph) + sense-label->freq (mk-sense-label->frequency graph) abridged (partial abridged-synset-label sense-label->freq) triples (->> (op/sparql "SELECT ?synset ?label WHERE { @@ -391,7 +374,7 @@ This function survives between releases, but the functions it calls are all considered temporary and should be deleted when the release comes." [dataset] - (let [expected-release "2023-09-28-SNAPSHOT"] + (let [expected-release "2023-11-28"] (assert (= current-release expected-release)) ; another check (println "Applying release changes for" expected-release "...") (fix-source-relations! dataset) @@ -522,3 +505,10 @@ dataset (->dataset db-type full-db-path)] (println "WARNING: no input dir provided!") (dataset->db dataset schema-uris))))) + +(comment + ((mk-word->frequency) :dn/word-11000987) + ((mk-sense-label->word (:graph @dk.cst.dannet.web.resources/db)) "agurk_§1") + ((mk-sense-label->frequency (:graph @dk.cst.dannet.web.resources/db)) + "agurk_§1") + #_.) diff --git a/src/main/dk/cst/dannet/db/search.clj b/src/main/dk/cst/dannet/db/search.clj index b9c348c..ae1a9e9 100644 --- a/src/main/dk/cst/dannet/db/search.clj +++ b/src/main/dk/cst/dannet/db/search.clj @@ -23,7 +23,6 @@ {?ontotype ?label}) (apply merge-with q/set-merge))))))) -;; TODO: need to also numerically order by synset key, not just alphabetically (defn look-up "Look up synsets in Graph `g` based on the given `lemma`." 
[g lemma] diff --git a/src/main/dk/cst/dannet/query.clj b/src/main/dk/cst/dannet/query.clj index e4f1fe3..0421286 100644 --- a/src/main/dk/cst/dannet/query.clj +++ b/src/main/dk/cst/dannet/query.clj @@ -7,7 +7,6 @@ [arachne.aristotle.query :as q] [dk.cst.dannet.prefix :as prefix] [dk.cst.dannet.transaction :as txn] - [dk.cst.dannet.web.components :as com] [dk.cst.dannet.query.operation :as op]) (:import [org.apache.jena.reasoner.rulesys FBRuleInfGraph])) @@ -216,12 +215,6 @@ coll) @weights)) -(defn sense-label-freqs - [g synset] - (->> (run-basic g op/synset-lemma-freqs {'?synset synset}) - (map (juxt '?lemma '?freq)) - (into {}))) - (defn other-entities "Restructure the `expanded-entity-result` as a mapping from resource->entity, not including the subject entity itself." @@ -240,7 +233,6 @@ (with-meta (->> (navigable-entity g result) (attach-blank-entities g subject)) (assoc (nav-meta g) - :sense-label->freq (sense-label-freqs g subject) :entities (other-entities result) :inferred (inferred-entity result (find-raw g subject)) ;; TODO: make more performant? diff --git a/src/main/dk/cst/dannet/query/operation.clj b/src/main/dk/cst/dannet/query/operation.clj index 61eecbe..5e34907 100644 --- a/src/main/dk/cst/dannet/query/operation.clj +++ b/src/main/dk/cst/dannet/query/operation.clj @@ -451,7 +451,7 @@ (def short-label-candidates (sparql - "SELECT (STR(?senseLabel) AS ?label) ?freq + "SELECT ?word (STR(?senseLabel) AS ?label) WHERE { ?synset rdf:type ontolex:LexicalConcept . FILTER(STRSTARTS(str(?synset), str(dn:))) . @@ -459,9 +459,6 @@ ?word ontolex:sense ?sense . FILTER(STRSTARTS(str(?word), str(dn:))) . ?sense rdfs:label ?senseLabel . - OPTIONAL { - ?word dns:ddoFrequency ?freq . 
- } }")) (def synset-long-short-labels diff --git a/src/main/dk/cst/dannet/query/operation/llm.clj b/src/main/dk/cst/dannet/query/operation/llm.clj index e780652..f65979b 100644 --- a/src/main/dk/cst/dannet/query/operation/llm.clj +++ b/src/main/dk/cst/dannet/query/operation/llm.clj @@ -358,6 +358,7 @@ [?canonical "et"]))) (into {})))) +;; TODO: do via a local file instead (DSL objects to the inclusion of freq data) (def lemma->frequency "Frequencies originally sourced from DDO." (delay diff --git a/src/main/dk/cst/dannet/web/components.cljc b/src/main/dk/cst/dannet/web/components.cljc index ff7868c..771a9e1 100644 --- a/src/main/dk/cst/dannet/web/components.cljc +++ b/src/main/dk/cst/dannet/web/components.cljc @@ -62,8 +62,7 @@ (defn transform-val "Performs convenient transformations of `v`, optionally informed by `opts`." - ([v {:keys [attr-key entity details? sense-label->freq] :as opts - :or {sense-label->freq {}}}] + ([v {:keys [attr-key entity] :as opts}] (cond (rdf-datatype? v) (let [{:keys [uri value]} v] @@ -914,7 +913,7 @@ (rum/with-key (option v on-key-down) v)))]])) (rum/defc search-page - [{:keys [languages lemma search-results sense-label->freq details?] 
:as opts}] [:article.search [:header [:h1 (str "\"" lemma "\"")]] @@ -928,9 +927,8 @@ (assoc k->label k short-label) k->label)] - (rum/with-key (attr-val-table {:languages languages - :sense-label->freq sense-label->freq - :k->label k->label'} + (rum/with-key (attr-val-table {:languages languages + :k->label k->label'} entity) k))))]) @@ -1066,7 +1064,7 @@ [:option {:value "da"} "\uD83C\uDDE9\uD83C\uDDF0 Dansk"]]) (rum/defc page-shell < rum/reactive - [page {:keys [entity subject languages sense-label->freq entities] :as opts}] + [page {:keys [entity subject languages entities] :as opts}] (let [page-component (or (get pages page) (throw (ex-info (str "No component for page: " page) @@ -1076,8 +1074,6 @@ languages' (:languages state') comments (translate-comments languages') synset-weights (:synset-weights (meta entity)) - sense-label->freq (or sense-label->freq - (:sense-label->freq (meta entity))) details? (or (get state' :details?) (get opts :details?)) entity-label* (partial entity-label (if details? @@ -1089,7 +1085,6 @@ opts' (assoc (merge opts state') :comments comments :k->label (update-vals entities' entity-label*) - :sense-label->freq (or sense-label->freq {}) :synset-weights synset-weights) [prefix _ _] (resolve-names opts') prefix' (or prefix (some-> entity diff --git a/src/main/dk/cst/dannet/web/resources.clj b/src/main/dk/cst/dannet/web/resources.clj index 1860796..a3d1d3c 100644 --- a/src/main/dk/cst/dannet/web/resources.clj +++ b/src/main/dk/cst/dannet/web/resources.clj @@ -367,12 +367,9 @@ (-> ctx (update :response assoc :status 200 - :body (body {:languages languages - :lemma lemma - :sense-label->freq (->> (keys results) - (map (partial q/sense-label-freqs (:graph @db))) - (apply merge)) - :search-results results} + :body (body {:languages languages + :lemma lemma + :search-results results} page-meta)) (update-in [:response :headers] merge (assoc (x-headers page-meta)