From 744a61a6a87b88ca5a15ee300afbec16d80964da Mon Sep 17 00:00:00 2001 From: Luke Gessler Date: Sun, 26 Dec 2021 02:24:10 -0500 Subject: [PATCH] overhaul resolver for interlinear last perf issue: pagination speed, otherwise looking good. --- src/main/glam/algos/text.cljc | 80 ++--- .../ui/document/interlinear_editor.cljs | 300 ++++++++---------- src/main/glam/models/token_layer.cljc | 86 ++++- 3 files changed, 263 insertions(+), 203 deletions(-) diff --git a/src/main/glam/algos/text.cljc b/src/main/glam/algos/text.cljc index 7a018b5..9f9bd2f 100644 --- a/src/main/glam/algos/text.cljc +++ b/src/main/glam/algos/text.cljc @@ -169,7 +169,8 @@ based on the occurrence of the newline character. A token will be split if it contains a newline character in the output of this function, even though all copies will have the same ID, in order to facilitate display. (Newlines in - tokens are virtually unheard of, so this shouldn't be a big deal.)" + tokens are virtually unheard of, so this shouldn't be a big deal.) + Also add a :token/line attribute which is the 0-indexed line number of the token." [tokens-and-strings {:text/keys [body]}] (let [token-text (fn [{:token/keys [begin end] :as token}] (subs body begin end))] @@ -177,43 +178,46 @@ current-line [] head (first tokens-and-strings) tail (rest tokens-and-strings)] - (cond - (nil? head) - (conj accum-lines current-line) - - ;; string with newline - (and (string? head) (clojure.string/index-of head "\n")) - (let [newline-index (clojure.string/index-of head "\n") - current-line (conj current-line (subs head 0 newline-index))] - (recur (conj accum-lines current-line) - [] - (subs head (inc newline-index)) - tail)) - - ;; token with newline - (and (map? head) (clojure.string/index-of (token-text head) "\n")) - (let [newline-index (clojure.string/index-of (token-text head) "\n") - current-line (conj current-line (assoc head :token/end (+ newline-index (:token/begin head)))) - new-head (assoc head :token/begin (+ (:token/begin head) (inc newline-index))) - new-head-text (token-text new-head)] - (recur (conj accum-lines current-line) - [] - (if-not (empty? new-head-text) new-head (first tail)) - (if-not (empty? new-head-text) tail (rest tail)))) - - ;; plain string - (and (string? head) (not (empty? head))) - (recur accum-lines - (conj current-line head) - (first tail) - (rest tail)) - - ;; plain token - :else - (recur accum-lines - (conj current-line head) - (first tail) - (rest tail)))))) + (let [line-number (count accum-lines)] + (cond + (nil? head) + (conj accum-lines current-line) + + ;; string with newline + (and (string? head) (clojure.string/index-of head "\n")) + (let [newline-index (clojure.string/index-of head "\n") + current-line (conj current-line (subs head 0 newline-index))] + (recur (conj accum-lines current-line) + [] + (subs head (inc newline-index)) + tail)) + + ;; token with newline + (and (map? head) (clojure.string/index-of (token-text head) "\n")) + (let [newline-index (clojure.string/index-of (token-text head) "\n") + current-line (conj current-line (-> head + (assoc :token/end (+ newline-index (:token/begin head))) + (assoc :token/line line-number))) + new-head (assoc head :token/begin (+ (:token/begin head) (inc newline-index))) + new-head-text (token-text new-head)] + (recur (conj accum-lines current-line) + [] + (if-not (empty? new-head-text) new-head (first tail)) + (if-not (empty? new-head-text) tail (rest tail)))) + + ;; plain token + (map? head) + (recur accum-lines + (conj current-line (assoc head :token/line line-number)) + (first tail) + (rest tail)) + + ;; plain string + :else + (recur accum-lines + (conj current-line head) + (first tail) + (rest tail))))))) (defn add-untokenized-substrings "Takes a sequence of tokens, finds which parts of the text aren't covered by the tokens, and inserts diff --git a/src/main/glam/client/ui/document/interlinear_editor.cljs b/src/main/glam/client/ui/document/interlinear_editor.cljs index 4284efa..7db73ad 100644 --- a/src/main/glam/client/ui/document/interlinear_editor.cljs +++ b/src/main/glam/client/ui/document/interlinear_editor.cljs @@ -1,5 +1,6 @@ (ns glam.client.ui.document.interlinear-editor (:require [goog.object :as gobj] + [clojure.set :refer [difference]] [com.fulcrologic.fulcro.components :as c :refer [defsc]] [com.fulcrologic.fulcro.routing.dynamic-routing :as dr] [com.fulcrologic.fulcro.data-fetch :as df] @@ -17,6 +18,7 @@ [glam.client.ui.common.core :refer [loader]] [glam.client.ui.material-ui :as mui] [glam.algos.text :as ta] + [glam.models.token-layer :as tokl] [glam.models.span :as span] [glam.client.ui.global-snackbar :as snack])) @@ -24,10 +26,6 @@ (def ui-auto-sizer (interop/react-factory AutoSizer)) (def ui-autosize-input (interop/react-factory AutosizeInput)) -(defn get-token-span-layers [{:keys [span-layer-scopes]}] - (->> span-layer-scopes (filter #(= (second %) :token)) keys set)) -(defn get-sentence-span-layers [{:keys [span-layer-scopes]}] - (->> span-layer-scopes (filter #(= (second %) :sentence)) keys set)) ;; schema management -------------------------------------------------------------------------------- (defn get-line->token [lines] @@ -44,33 +42,6 @@ spans))) tokens)) -(defn reshape-into-token-grid - "Given a token layer's data tree, returns the sequence of tokens where each token has - been enriched with a :spans attribute containing a list of 2-tuples, where for each span - in the layer that is linked to the token and is configured as a token-level span layer, - the first item is the span layer's ID, and the second item is a sequence of all spans on - that layer that were linkde to this token. Example: - - (reshape-into-token-grid tl) - => - ({:token/id :tok2, - :token/value \"sentence\", - :token/begin 5, - :token/end 13, - :spans ([:sl1 (#:span{:id :s2, :value \"NN\", :tokens [#:token{:id :tok2}]})]) - ...) - " - [config {:token-layer/keys [tokens span-layers]}] - (let [token-span-layers (get-token-span-layers config)] - (for [{token-id :token/id :as token} tokens] - (assoc token - :spans - (for [{:span-layer/keys [id spans]} (filter #(token-span-layers (:span-layer/id %)) span-layers)] - (let [filtered-spans (filter (fn [{:span/keys [tokens]}] - (some #(= (:token/id %) token-id) tokens)) - spans)] - [id filtered-spans])))))) - (declare InterlinearEditor) (m/defmutation schema-update [{mark-ready? :ui/mark-ready? document-id :document/id}] @@ -101,62 +72,71 @@ of the interface." [{:keys [data-tree] document-id :document/id mark-ready? :ui/mark-ready?}] (action [{:keys [app]}] - (let [config (get-in data-tree [:document/project :project/config :editors :interlinear]) - token-level-layer-ids (get-token-span-layers config) - sentence-level-layer-ids (get-sentence-span-layers config) - text-layers (:document/text-layers data-tree) - batches (atom [])] + (let [text-layers (:document/text-layers data-tree) + batches (atom []) + get-snapshots (memoize (fn [token-layer] + (group-by :span/layer + (mapcat :token/spans + (:token-layer/columnar-tokens token-layer)))))] ;; Ensure that all token-level span layers have a span per-token (doseq [text-layer text-layers] - (doseq [{:token-layer/keys [tokens] :as token-layer} (:text-layer/token-layers text-layer)] - (doseq [{:span-layer/keys [id spans] :as sl} (filter #(token-level-layer-ids (:span-layer/id %)) - (:token-layer/span-layers token-layer))] + (doseq [token-layer (:text-layer/token-layers text-layer)] + ;; For each token-level span layer... + (doseq [{sl-id :span-layer/id} (:token-layer/token-span-layers token-layer)] (let [updates (atom [])] - (doseq [{:token/keys [id] :as token} (spanless-tokens tokens spans)] - (swap! updates conj - [:create {:span/value "" - :span/layer (:span-layer/id sl) - :span/tokens [id]}])) + ;; For each token... + (doseq [{:token/keys [id spans] :as tok} (:token-layer/columnar-tokens token-layer)] + ;; Check how many spans there are for this layer + (let [spans-for-layer (filter #(= (:span/layer %) sl-id) spans)] + ;; If there's no span for that token for that span layer, create one + (when (empty? spans-for-layer) + (swap! updates conj + [:create {:span/value "" + :span/layer sl-id + :span/tokens [id]}])) + ;; If there's more than one span for that token, delete the others + (when-let [{:span/keys [id]} (seq (rest spans-for-layer))] + (swap! updates conj + [:delete id])))) + (when-not (empty? @updates) (swap! batches conj - {:span-layer/id id - :span-snapshots spans + {:span-layer/id sl-id + :span-snapshots ((get-snapshots token-layer) sl-id) :updates @updates})))))) ;; Ensure that all sentence-level span layers have exactly one span per-token - (doseq [{:text-layer/keys [text] :as text-layer} text-layers] + (doseq [text-layer text-layers] (doseq [token-layer (:text-layer/token-layers text-layer)] - (let [tokens (sort-by :token/begin (reshape-into-token-grid config token-layer)) - lines-with-strings (-> tokens - (ta/add-untokenized-substrings text) - (ta/separate-into-lines text)) - lines (map #(filter :token/id %) lines-with-strings) - contentful-lines (filter some? (map-indexed #(if (empty? %2) nil %1) lines)) - tokens-by-line (get-line->token lines) - lines-by-token (get-token->line lines)] - (doseq [{:span-layer/keys [id spans] :as sl} (filter #(sentence-level-layer-ids (:span-layer/id %)) - (:token-layer/span-layers token-layer))] - (let [spans-with-lines (map (fn [span] - (assoc span :lines (set (map #(lines-by-token (:token/id %)) (:span/tokens span))))) - spans) - updates (atom []) - spans (atom spans-with-lines)] + (let [tokens (:token-layer/columnar-tokens token-layer) + contentful-lines (set (map :token/line tokens)) + token-index (into {} (map (fn [v] [(:token/id v) v]) tokens)) + line->tokens (group-by :token/line tokens)] + (doseq [{sl-id :span-layer/id} (:token-layer/sentence-span-layers token-layer)] + (let [updates (atom []) + spans (atom (filterv #(= (:span/layer %) sl-id) (:token-layer/sentence-level-spans token-layer)))] + + (log/info @spans) ;; Check 1: if some spans span multiple lines, choose just one line (the smallest) ;; TODO: just takes the lowest numbered line right now, probably we want to actually take the line ;; with the most tokens associated with the given span (reset! spans - (for [{lines :lines tokens :span/tokens id :span/id :as span} @spans] - (if (> (count lines) 1) - (let [min-line (apply min lines) - new-tokens (filterv #(= min-line (lines-by-token (:token/id %))) tokens)] - (swap! updates conj [:merge {:span/id id :span/tokens new-tokens}]) - (-> span - (assoc :lines #{min-line}) - (assoc :span/tokens new-tokens))) - span))) + (mapv (fn [{tokens :span/tokens id :span/id :as span}] + (let [lines (map (comp :token/line #(get token-index %)) tokens)] + (log/info lines span) + (if (> (count (set lines)) 1) + (let [min-line (apply min lines) + new-tokens (filterv #(= min-line (line->tokens (:token/id %))) tokens)] + (log/info new-tokens) + (swap! updates conj [:merge {:span/id id :span/tokens new-tokens}]) + (-> span + (assoc :lines #{min-line}) + (assoc :span/tokens new-tokens))) + (assoc span :lines (set lines))))) + @spans)) ;; Check 2: if a line has more than one span, choose the longest one and delete the rest (let [spans-by-line (into {} (for [line-number contentful-lines] @@ -174,19 +154,23 @@ ;; Check 3: if some spans only incompletely span a line, expand them (reset! spans - (for [{:span/keys [id tokens] :as span} @spans] - (let [line-num (lines-by-token (-> tokens first :token/id)) - line-tokens (tokens-by-line line-num)] - (if-not (= (set line-tokens) (set (map :token/id tokens))) - (do - (swap! updates conj [:merge {:span/id id :span/tokens (vec line-tokens)}]) - (assoc span :span/tokens (vec line-tokens))) - span)))) + (doall (for [{:span/keys [id tokens] :as span} @spans] + (let [line-num (-> span :lines first) + line-token-ids (mapv :token/id (line->tokens line-num))] + (if-not (= (set line-token-ids) (set tokens)) + (do + (swap! updates conj [:merge {:span/id id :span/tokens line-token-ids}]) + (assoc span :span/tokens line-token-ids)) + span))))) ;; Check 4: if some lines entirely lack a span, create one - (let [needs-span (filterv (fn [n] (not-any? #((:lines %) n) @spans)) contentful-lines)] + (let [covered-lines (set (mapcat #(->> (% :span/tokens) + (map token-index) + (map :token/line)) + @spans)) + needs-span (difference contentful-lines covered-lines)] (doseq [line-num needs-span] - (let [tokens-for-line (vec (tokens-by-line line-num)) + (let [tokens-for-line (vec (line->tokens line-num)) record {:span/value "" :span/tokens tokens-for-line}] (swap! updates conj [:create record]) (swap! spans conj (assoc record :span/id (tempid/tempid)))))) @@ -194,9 +178,8 @@ (when-not (empty? @updates) (swap! batches conj - {:span-layer/id id - :span-snapshots (map #(update % :span/tokens (fn [ts] (mapv :token/id ts))) - (:span-layer/spans sl)) + {:span-layer/id sl-id + :span-snapshots ((get-snapshots token-layer) sl-id) :updates @updates}))))))) (if-not (empty? @batches) @@ -227,32 +210,6 @@ :severity (if error? "error" "success")}) (swap! state assoc-in (conj ref :ui/dirty?) false)))))) -;; Query components -------------------------------------------------------------------------------- -(defsc Span - [this {:span/keys [id value]}] - {:query [:span/id :span/value :span/tokens :ui/focused? :ui/dirty?] - :pre-merge (fn [{:keys [data-tree current-normalized]}] - ;; If we're merging into a span that's currently being edited, keep its current value - (if (:ui/focused? current-normalized) - (merge current-normalized - data-tree - {:span/value (:span/value current-normalized)}) - (merge current-normalized - data-tree))) - :ident :span/id}) - -(defsc SpanLayer - [this {:span-layer/keys [id name spans]}] - {:query [:span-layer/id :span-layer/name - {:span-layer/spans (c/get-query Span)}] - ;; TODO maybe try to use pre-merge here to hold on to any existing tempid spans to resolve focus loss issue on temp spans - :ident :span-layer/id}) - -(defsc Token - [this {:token/keys [id value]}] - {:query [:token/id :token/value :token/begin :token/end :token/text] - :ident :token/id}) - ;; UI helpers -------------------------------------------------------------------------------- (defn merge-props-with-style [default extra] (if (:style extra) @@ -285,7 +242,15 @@ ;; Why? See reshape-into-token-grid. (defsc SentenceLevelSpan [this {:span/keys [id tokens] :as props}] {:ident :span/id - :query [:span/id :span/value :span/tokens] + :query [:span/id :span/value :span/tokens :span/layer] + :pre-merge (fn [{:keys [data-tree current-normalized]}] + ;; If we're merging into a span that's currently being edited, keep its current value + (if (:ui/focused? current-normalized) + (merge current-normalized + data-tree + {:span/value (:span/value current-normalized)}) + (merge current-normalized + data-tree))) ;; Need to use local state here because otherwise we lose focus when another user triggers a refresh. :initLocalState (fn [this props] {:value (or (:span/value props) "")}) :componentDidUpdate (fn [this prev-props _] @@ -321,11 +286,18 @@ :backgroundColor (if focused? "#e3ffe6" "transparent")}})))) (def ui-sentence-level-span (c/computed-factory SentenceLevelSpan {:keyfn (comp str :span/id)})) -(defsc SpanCell [this {:span/keys [id] :as props} {token-id :token/id - span-layer-id :span-layer/id - token-width :token-width :as cp}] +(defsc SpanCell [this {:span/keys [id] :as props} {token-id :token/id + token-width :token-width :as cp}] {:ident :span/id - :query [:span/id :span/value] + :query [:span/id :span/value :span/layer :span/tokens] + :pre-merge (fn [{:keys [data-tree current-normalized]}] + ;; If we're merging into a span that's currently being edited, keep its current value + (if (:ui/focused? current-normalized) + (merge current-normalized + data-tree + {:span/value (:span/value current-normalized)}) + (merge current-normalized + data-tree))) ;; Need to use local state here because otherwise we lose focus when another user triggers a refresh. :initLocalState (fn [this props] {:value (or (:span/value props) "")}) :componentDidUpdate (fn [this prev-props _] @@ -339,6 +311,7 @@ {:type "text" :value (if focused? value (:span/value props)) :onChange (fn [e] + (log/info "Changed") (c/set-state! this {:value (.-value (.-target e)) :ui/dirty? true})) :onFocus #(c/set-state! this (assoc (c/get-state this) :ui/focused? true)) @@ -347,7 +320,9 @@ (when dirty? (c/transact! this [(save-span {:span/id id :span/value value})] - {:on-result #(c/set-state! this (assoc (c/get-state this) :ui/dirty? false))}))) + {:on-result (fn [] + (log/info "Undirtying") + (c/set-state! this (assoc (c/get-state this) :ui/dirty? false)))}))) :inputStyle {:minWidth (or (and token-width (str (max (- token-width 4) 20) "px")) "20px") :display "inline-block" @@ -363,31 +338,30 @@ (def ui-span-cell (c/computed-factory SpanCell {:keyfn (comp str :span/id)})) ;; Here is where the real UI begins -(defsc TokenCell [this {:token/keys [id value] spans :spans :as props}] - {:query [:token/id :token/value :spans] +(defsc ColumnarToken [this {:token/keys [id value spans] :as props}] + {:query [:token/id :token/value :token/line {:token/spans (c/get-query SpanCell)}] :ident :token/id :initLocalState (fn [this props] - {:save-ref (fn [r] - (gobj/set this "token-ref" r))}) + {:save-ref #(gobj/set this "token-ref" %)}) :componentDidMount (fn [this prev-props _] ;; The first render sets up the ref but not in time for its width to be grabbed. ;; Force another render on mount so that we have a chance to properly get the width of the ref. (.forceUpdate this))} - (let [save-ref (c/get-state this :save-ref)] + (let [save-ref (c/get-state this :save-ref) + grouped-spans (group-by :span/layer spans)] (flex-col {:key id} (cell {} (dom/span {:ref save-ref} value)) - (mapv (fn [[sl-id spans]] + (mapv (fn [[sl spans]] (when (> (count spans) 1) - (log/warn (str "Found a token " id " with more than one associated span in " sl-id "." + (log/warn (str "Found a token " id " with more than one associated span in " (:span/layer (first spans)) "." " Currently, this is not supported, and only the first span will be used."))) (when (> (count spans) 0) (ui-span-cell (c/computed (first spans) - {:token/id id - :span-layer/id sl-id - :token-width (when-let [t (gobj/get this "token-ref")] - (.-width (.getBoundingClientRect t)))})))) - spans)))) -(def ui-token (c/factory TokenCell {:keyfn :token/id})) + {:token/id id + :token-width (when-let [t (gobj/get this "token-ref")] + (.-width (.getBoundingClientRect t)))})))) + grouped-spans)))) +(def ui-columnar-token (c/factory ColumnarToken {:keyfn :token/id})) ;; Where much of the work happens -------------------------------------------------------------------------------- ;; TODO debug @@ -398,35 +372,43 @@ (c/set-state! this (assoc state cache-key [invalidator-value (calc-fn)])) (second (cache-key state))))) +(defsc SpanLayer [_ _] + {:ident :span-layer/id + :query [:span-layer/id :span-layer/name]}) + (defsc TokenLayer - [this {:token-layer/keys [id name tokens span-layers] :ui/keys [page] :as token-layer} {:keys [text config]}] + [this + {:token-layer/keys [id name columnar-tokens sentence-level-spans token-span-layers sentence-span-layers] + :ui/keys [page] :as token-layer} + {:keys [text config]}] {:query [:token-layer/id :token-layer/name - {:token-layer/tokens (c/get-query Token)} - {:token-layer/span-layers (c/get-query SpanLayer)} + {:token-layer/columnar-tokens (c/get-query ColumnarToken)} + {:token-layer/sentence-level-spans (c/get-query SentenceLevelSpan)} + {:token-layer/token-span-layers (c/get-query SpanLayer)} + {:token-layer/sentence-span-layers (c/get-query SpanLayer)} :ui/page] - :initLocalState (fn [this props] - {}) :pre-merge (fn [{:keys [data-tree current-normalized]}] (assoc data-tree :ui/page 1 (or (:ui/page current-normalized) 1))) :componentWillUnmount (fn [this] (m/set-value! this :ui/page 1)) :ident :token-layer/id} - (let [tokens (sort-by :token/begin (reshape-into-token-grid config token-layer)) - lines-with-strings (-> tokens - (ta/add-untokenized-substrings text) - (ta/separate-into-lines text)) - lines (map #(filter :token/id %) lines-with-strings) - line->token-ids (get-line->token lines) - filtered-lines (map second (filter #(not-empty (line->token-ids (first %))) (map-indexed (fn [i v] [i v]) lines))) - line->token-ids (get-line->token filtered-lines) - token-span-layers (filter #((get-token-span-layers config) (:span-layer/id %)) span-layers) - sentence-span-layers (filter #((get-sentence-span-layers config) (:span-layer/id %)) span-layers) + ;; TODO remap lines to avoid empty lines + (let [contentful-lines (set (map :token/line columnar-tokens)) + line-remapping (into {} (map (fn [orig new] [orig new]) + (sort contentful-lines) + (range (count contentful-lines)))) + tokens-by-line (group-by (comp line-remapping :token/line) columnar-tokens) + + tokens-by-id (into {} (for [{id :token/id :as token} columnar-tokens] [id token])) + line-count (count contentful-lines) + sentence-level-spans-by-line (group-by (comp line-remapping :token/line tokens-by-id first :span/tokens) sentence-level-spans) render-line - (fn [[i line]] + (fn [[i tokens]] ;; For each line... (dom/div {:style {:backgroundColor (if (even? i) "#0055ff17" "white") :borderRadius 4 :padding "0.3em" - :marginBottom "1em"}} + :marginBottom "1em"} + :key (str "line" i)} ;; Token-level (flex-row {:style {:flexWrap "wrap" :marginBottom "20px"}} @@ -442,7 +424,7 @@ token-span-layers)) ;; Token columns - (mapv ui-token line)) + (mapv ui-columnar-token tokens)) ;; Sentence-level (flex-row {:style {:marginTop "12px"}} @@ -457,33 +439,27 @@ sentence-span-layers)) (flex-col {:key "spans"} - (for [{:span-layer/keys [id spans]} sentence-span-layers] - (let [tokens-for-line (set (line->token-ids i)) - span (some #(when (= tokens-for-line (set (map :token/id (:span/tokens %)))) %) spans)] - (when span - (ui-sentence-level-span span))))))))] - + (doall (for [sl-id (map :span-layer/id sentence-span-layers)] + (let [span (some #(when (= (:span/layer %) sl-id) %) (sentence-level-spans-by-line i))] + (when span + (ui-sentence-level-span span)))))))))] (let [page-count 10 pagination (fn [] - (when (> (count filtered-lines) page-count) + (when (> line-count page-count) (mui/pagination - {:count (js/Math.ceil (/ (count filtered-lines) page-count)) + {:count (js/Math.ceil (/ line-count page-count)) :page page :onChange #(m/set-integer! this :ui/page :value %2)})))] (mui/card {} (mui/card-header {:action (pagination) :title name}) - #_(mui/typography {:variant "h5" :style {:marginBottom "1em"}} name) - (mui/card-content {} - (mapv render-line (->> filtered-lines - (drop (* page-count (dec page))) - (take page-count) - (map-indexed (fn [i v] [i v]))))) + (mapv render-line (map (fn [i] [i (tokens-by-line i)]) + (range (* page-count (dec page)) (min line-count (* page-count page)))))) (mui/card-actions {} - (pagination)))))) + (pagination)))))) (def ui-token-layer (c/computed-factory TokenLayer {:keyfn :token-layer/id})) diff --git a/src/main/glam/models/token_layer.cljc b/src/main/glam/models/token_layer.cljc index c45d000..85ac266 100644 --- a/src/main/glam/models/token_layer.cljc +++ b/src/main/glam/models/token_layer.cljc @@ -12,7 +12,10 @@ #?(:clj [glam.xtdb.text :as txt]) #?(:clj [glam.xtdb.token-layer :as tokl]) #?(:clj [glam.xtdb.easy :as gxe]) - #?(:clj [glam.xtdb.token :as tok]))) + #?(:clj [glam.xtdb.token :as tok]) + #?(:clj [glam.xtdb.span :as span]) + #?(:clj [glam.xtdb.span-layer :as sl]) + [glam.algos.text :as ta])) (def token-layer-keys [:token-layer/name :token-layer/span-layers]) @@ -31,6 +34,37 @@ (def validator (fs/make-validator token-layer-valid)) + +(defn get-token-span-layers [layer-ids] + (->> layer-ids (filter #(= (second %) :token)) keys set)) +(defn get-sentence-span-layers [layer-ids] + (->> layer-ids (filter #(= (second %) :sentence)) keys set)) + +(defn reshape-into-token-grid + "Given a token layer's data tree with :token-layer/spans, returns the sequence of tokens + where each token has been enriched with a :token/spans attribute containing a list of 2-tuples, + where for each span in the layer that is linked to the token and is configured as a + token-level span layer, the first item is the span layer's ID, and the second item is a + sequence of all spans on that layer that were linkde to this token. Example: + + (reshape-into-token-grid tl) + => + ({:token/id :tok2, + :token/value \"sentence\", + :token/begin 5, + :token/end 13, + :token/spans (#:span{:id :s2, :value \"NN\", :tokens [#:token{:id :tok2}], :layer :sl1}) + ...) + " + [tokens spans] + (for [{token-id :token/id :as token} tokens] + (assoc token + :token/spans + (let [filtered-spans (filterv (fn [{:span/keys [tokens]}] + (some #(= % token-id) tokens)) + spans)] + filtered-spans)))) + ;; user -------------------------------------------------------------------------------- #?(:clj (pc/defresolver get-token-layer [{:keys [node]} {:token-layer/keys [id]}] @@ -88,6 +122,52 @@ {:token-layer/tokens tokens :token-layer/span-layers span-layers}))))) +#?(:clj + (pc/defresolver lorge-get-columnar-tokens [{:keys [node] :as env} {:token-layer/keys [id]}] + {::pc/input #{:token-layer/id} + ::pc/output [:token-layer/name + {:token-layer/columnar-tokens + [:token/id :token/begin :token/end :token/layer :token/value :token/line + {:token/spans [:span/id :span/value :span/layer :span/tokens]}]} + {:token-layer/sentence-level-spans [:span/id :span/layer :span/value :span/tokens]} + {:token-layer/token-span-layers [:span-layer/id :span-layer/name]} + {:token-layer/sentence-span-layers [:span-layer/id :span-layer/name]}] + ::pc/transform (ma/readable-required :token-layer/id)} + (when-let [[_ doc-id] (mc/try-get-document-ident env)] + (let [config (get-in (ffirst (xt/q (xt/db node) '{:find [?pc] + :where [[?doc :document/project ?prj] + [?prj :project/config ?pc]] + :in [?doc]} + doc-id)) + [:editors :interlinear]) + text (ffirst (xt/q (xt/db node) + '{:find [(pull ?txt [:text/id :text/body])] + :where [[?txt :text/document ?doc]] + :in [?doc]} + doc-id)) + tokens (mapv (fn [token] + (-> token (update :token/layer (fn [id] {:token-layer/id id})))) + (tok/get-tokens node id doc-id)) + token-sl-ids (get-token-span-layers (:span-layer-scopes config)) + + token-span-layers (mapv #(sl/get node %) token-sl-ids) + token-spans (mapcat #(span/get-spans node doc-id %) token-sl-ids) + + sorted-tokens (sort-by :token/begin (reshape-into-token-grid tokens token-spans)) + line-enriched-tokens (filter :token/id (apply concat (-> sorted-tokens + (ta/add-untokenized-substrings text) + (ta/separate-into-lines text)))) + + sentence-sl-ids (get-sentence-span-layers (:span-layer-scopes config)) + sentence-level-spans (mapcat #(span/get-spans node doc-id %) sentence-sl-ids) + sentence-span-layers (map #(sl/get node %) sentence-sl-ids)] + (log/info (vec sentence-span-layers)) + {:token-layer/name (:token-layer/name (gxe/entity node id)) + :token-layer/columnar-tokens line-enriched-tokens + :token-layer/sentence-level-spans sentence-level-spans + :token-layer/token-span-layers token-span-layers + :token-layer/sentence-span-layers sentence-span-layers})))) + #?(:clj (pc/defmutation whitespace-tokenize [{:keys [node] :as env} {:token-layer/keys [id] doc-id :document/id @@ -156,6 +236,6 @@ (server-message (str "Token layer " name " deleted"))))))) #?(:clj - (def token-layer-resolvers [get-token-layer get-tokens lorge-get-tokens create-token-layer save-token-layer - delete-token-layer whitespace-tokenize])) + (def token-layer-resolvers [get-token-layer get-tokens lorge-get-tokens lorge-get-columnar-tokens + create-token-layer save-token-layer delete-token-layer whitespace-tokenize]))