Merge pull request #51 from clulab/kwalcock/mathWithDualSum
Factor out the math
kwalcock authored Nov 30, 2023
2 parents 32e07af + 70f44f2 commit 09218f7
Showing 41 changed files with 2,158 additions and 465 deletions.
10 changes: 8 additions & 2 deletions CHANGES.md
@@ -1,4 +1,10 @@
+ **0.6.2** - Shade additional package internal to EJML
+ **0.6.1** - Shade EJML
+ **0.6.0** - Calculate with EJML rather than Breeze
+ **0.6.0** - Use sum instead of concat
+ **0.5.0** - Support Linux on aarch64
+ **0.5.0** - Isolate dependencies on models to the apps subproject
+ **0.4.0** - Account for maxTokens
+ **0.3.0** - Include some tokenizers as resources and only fall back to network if necessary
+ **0.2.0** - Faster version using JNI
+ **0.1.0** - Initial version using the J4rs library
4 changes: 2 additions & 2 deletions README.md
@@ -13,7 +13,7 @@ The libraries and models resulting from this project are incorporated into [proc
To incorporate the encoder subproject as a Scala library dependency, either to access an existing model or because you've trained a new one with the Python code there, you'll need to add something like this to your `build.sbt` file:

```scala
libraryDependencies += "org.clulab" %% "scala-transformers-encoder" % "0.3.0"
libraryDependencies += "org.clulab" %% "scala-transformers-encoder" % "0.4.0"
```

New models should generally be published to the [CLU Lab's artifactory server](https://artifactory.clulab.org/) so that they can be treated as library dependencies, although they can also be accessed as local files. Two models have been generated and published. They are incorporated into a Scala project with
@@ -36,7 +36,7 @@ Please see the [encoder README](encoder/README.md) for information about how to
To use the tokenizer subproject as a Scala library dependency, you'll need to add something like this to your `build.sbt` file:

```scala
libraryDependencies += "org.clulab" %% "scala-transformers-tokenizer" % "0.3.0"
libraryDependencies += "org.clulab" %% "scala-transformers-tokenizer" % "0.4.0"
```

See the [tokenizer README](tokenizer/README.md) for information about which tokenizers have already been packaged and how they are accessed.
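For reference, the published models and the resolver that serves them both appear in the new apps/build.sbt later in this commit. A minimal build.sbt sketch that combines them with the encoder dependency above, with coordinates copied from this diff (adjust versions to whatever is current for your project):

```scala
// Sketch only: the resolver and artifact coordinates are copied from apps/build.sbt
// and README.md as they appear in this commit; versions may have moved on.
resolvers += "Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"

libraryDependencies ++= Seq(
  "org.clulab" %% "scala-transformers-encoder" % "0.4.0", // the encoder library
  "org.clulab"  % "deberta-onnx-model"          % "0.2.0"  // one of the published ONNX models
)
```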
19 changes: 19 additions & 0 deletions apps/build.sbt
@@ -0,0 +1,19 @@
name := "scala-transformers-apps"
description := "Houses apps, particularly those having extra library dependencies"

resolvers ++= Seq(
"Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
)

libraryDependencies ++= {
Seq(
"org.clulab" % "deberta-onnx-model" % "0.2.0",
"org.clulab" % "electra-onnx-model" % "0.2.0",
"org.clulab" % "roberta-onnx-model" % "0.2.0",
"org.scalatest" %% "scalatest" % "3.2.15" % "test"
)
}

fork := true

// assembly / mainClass := Some("com.keithalcock.tokenizer.scalapy.apps.ExampleApp")
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import dev.ludovic.netlib.blas.{BLAS, JavaBLAS, NativeBLAS}

@@ -1,11 +1,16 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.{Encoder, TokenClassifierFactoryFromFiles, TokenClassifierLayout}
import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromFileApp extends App {
val baseName = args.lift(0).getOrElse("../tcmodel")
// Choose one of these.
val defaultBaseName = "../models/microsoft_deberta_v3_base_mtl/avg_export"
// val defaultBaseName = "../models/google_electra_small_discriminator_mtl/avg_export"
// val defaultBaseName = "../models/roberta_base_mtl/avg_export"

val baseName = args.lift(0).getOrElse(defaultBaseName)
val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromFiles(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
@@ -1,11 +1,15 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.{Encoder, TokenClassifierFactoryFromResources, TokenClassifierLayout}
import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromResourceApp extends App {
val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"
// Choose one of these.
val baseName = "/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"

val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromResources(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
@@ -1,18 +1,15 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.TokenClassifier

/*
import java.io.File
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer
import org.clulab.scala_transformers.tokenizer.LongTokenization
*/

object TokenClassifierExampleApp extends App {
val tokenClassifier = TokenClassifier.fromFiles("/home/msurdeanu/github/scala-transformers/roberta-base-mtl/avg_export")
//val tokenClassifier = TokenClassifier.fromFiles("../microsoft-deberta-v3-base-mtl/avg_export")
//val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// Choose one of these.
val tokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export")

//val words = Seq("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
val words = Seq("John", "Doe", "went", "to", "China", ".")
@@ -0,0 +1,130 @@
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.common.Timers
import org.clulab.scala_transformers.encoder.{EncoderMaxTokensRuntimeException, TokenClassifier}
import org.clulab.scala_transformers.tokenizer.LongTokenization

import scala.io.Source

object TokenClassifierTimerApp extends App {

class TimedTokenClassifier(tokenClassifier: TokenClassifier) extends TokenClassifier(
tokenClassifier.encoder, tokenClassifier.maxTokens, tokenClassifier.tasks, tokenClassifier.tokenizer
) {
val tokenizeTimer = Timers.getOrNew("Tokenizer")
val forwardTimer = Timers.getOrNew("Encoder.forward")
val predictTimers = tokenClassifier.tasks.indices.map { index =>
val name = tasks(index).name

Timers.getOrNew(s"Encoder.predict $index\t$name")
}

// NOTE: This should be copied from the base class and then instrumented with timers.
override def predictWithScores(words: Seq[String], headTaskName: String = "Deps Head"): Array[Array[Array[(String, Float)]]] = {
// This condition must be met in order for allLabels to be filled properly without nulls.
// The condition is not checked at runtime!
// if (tasks.exists(_.dual))
// require(tasks.count(task => !task.dual && task.name == headTaskName) == 1)

// tokenize to subword tokens
val tokenization = tokenizeTimer.time {
LongTokenization(tokenizer.tokenize(words.toArray))
}
val inputIds = tokenization.tokenIds
val wordIds = tokenization.wordIds
val tokens = tokenization.tokens

if (inputIds.length > maxTokens) {
throw new EncoderMaxTokensRuntimeException(s"Encoder error: the following text contains more tokens than the maximum number accepted by this encoder ($maxTokens): ${tokens.mkString(", ")}")
}

// run the sentence through the transformer encoder
val encOutput = forwardTimer.time {
encoder.forward(inputIds)
}

// outputs for all tasks stored here: task x tokens in sentence x scores per token
val allLabels = new Array[Array[Array[(String, Float)]]](tasks.length)
// all heads predicted for every token
// dimensions: token x heads
var heads: Option[Array[Array[Int]]] = None

// now generate token label predictions for all primary tasks (not dual!)
for (i <- tasks.indices) {
if (!tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, None, None)
}
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels

// if this is the task that predicts head positions, then save them for the dual tasks
// we save all the heads predicted for each token
if (tasks(i).name == headTaskName) {
heads = Some(tokenLabels.map(_.map(_._1.toInt)))
}
}
}

// generate outputs for the dual tasks, if heads were predicted by one of the primary tasks
// the dual task(s) must be aligned with the heads.
// that is, we predict the top label for each of the head candidates
if (heads.isDefined) {
//println("Tokens: " + tokens.mkString(", "))
//println("Heads:\n\t" + heads.get.map(_.slice(0, 3).mkString(", ")).mkString("\n\t"))
//println("Masks: " + TokenClassifier.mkTokenMask(wordIds).mkString(", "))
val masks = Some(TokenClassifier.mkTokenMask(wordIds))

for (i <- tasks.indices) {
if (tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, heads, masks)
}
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels
}
}
}

allLabels
}
}

val verbose = false
val fileName = args.lift(0).getOrElse("../corpora/sentences/sentences.txt")
// Choose one of these.
val untimedTokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")

val tokenClassifier = new TimedTokenClassifier(untimedTokenClassifier)
val lines = {
val source = Source.fromFile(fileName)
val lines = source.getLines().take(100).toArray

source.close
lines
}
val elapsedTimer = Timers.getOrNew("Elapsed")

elapsedTimer.time {
lines.zipWithIndex/*.par*/.foreach { case (line, index) =>
println(s"$index $line")
if (index != 1382) {
val words = line.split(" ").toSeq
val allLabelSeqs = tokenClassifier.predictWithScores(words)

if (verbose) {
println(s"Words: ${words.mkString(", ")}")
for (layer <- allLabelSeqs) {
val words = layer.map(_.head) // Collapse the next layer by just taking the head.
val wordLabels = words.map(_._1)

println(s"Labels: ${wordLabels.mkString(", ")}")
}
}
}
}
}
Timers.summarize()
}
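The Timers helper is imported above from org.clulab.scala_transformers.common. A minimal, self-contained sketch of the three calls this app relies on (Timers.getOrNew, timer.time, and Timers.summarize, all taken from this diff), with arbitrary work standing in for tokenization and prediction:

```scala
import org.clulab.scala_transformers.common.Timers

object TimersUsageSketch extends App {
  // The timed block's value is returned by time, matching the usage above.
  val timer = Timers.getOrNew("sqrt-sum")
  val result = timer.time {
    (1 to 1000000).map(i => math.sqrt(i.toDouble)).sum // arbitrary stand-in work
  }

  println(result)
  Timers.summarize() // report the registered timers
}
```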
12 changes: 9 additions & 3 deletions build.sbt
@@ -1,10 +1,10 @@
val scala211 = "2.11.12" // up to 2.11.12
val scala212 = "2.12.18" // up to 2.12.18
val scala213 = "2.13.11" // up to 2.13.11
val scala213 = "2.13.12" // up to 2.13.12
val scala30 = "3.0.2" // up to 3.0.2
val scala31 = "3.1.3" // up to 3.1.3
val scala32 = "3.2.2" // up to 3.2.2
val scala33 = "3.3.0" // up to 3.3.0
val scala33 = "3.3.1" // up to 3.3.1
val scala3 = scala31

// Breeze 1.1+ is not available for scala211.
@@ -14,7 +14,13 @@ ThisBuild / scalaVersion := scala212
name := "scala-transformers"

lazy val root = (project in file("."))
.aggregate(common, tokenizer, encoder)
.aggregate(apps, common, tokenizer, encoder)
.settings(
publish / skip := true
)

lazy val apps = project
.dependsOn(encoder)
.settings(
publish / skip := true
)
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.tokenizer
package org.clulab.scala_transformers.common

import java.io.{FileInputStream, FileOutputStream}

@@ -33,6 +33,8 @@ object LibraryLoader {
else text

def isMac: Boolean = System.getProperty("os.name").startsWith("Mac ")

def isWin: Boolean = System.getProperty("os.name").startsWith("Win")

def isApple: Boolean = System.getProperty("os.arch") == "aarch64"

@@ -41,7 +43,7 @@ object LibraryLoader {
// The Mac reports a libname ending with .dylib, but Java needs .jnilib instead.
val jniName = replaceSuffix(libName, ".dylib", ".jnilib");
val resourceName =
if (!isMac) jniName
if (isWin) jniName
else if (isApple) "apple-" + jniName
else "intel-" + jniName

@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.timer
package org.clulab.scala_transformers.common

import scala.collection.mutable.{HashMap => MutableHashMap}

25 changes: 21 additions & 4 deletions encoder/build.sbt
@@ -2,24 +2,41 @@ name := "scala-transformers-encoder"
description := "Provides a Scala interface to huggingface encoders"

resolvers ++= Seq(
"Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
// "Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
)

libraryDependencies ++= {
val breezeVersion = CrossVersion.partialVersion(scalaVersion.value) match {
case Some((2, minor)) if minor < 12 => "1.0"
case _ => "2.1.0"
}
val ejmlVersion = "0.41" // Use this older version for Java 8.

Seq(
"org.scalanlp" %% "breeze" % breezeVersion,
// Choose one of these.
/// "org.apache.commons" % "commons-math3" % "3.6.1",
"org.ejml" % "ejml-core" % ejmlVersion,
"org.ejml" % "ejml-fdense" % ejmlVersion,
"org.ejml" % "ejml-simple" % ejmlVersion,
// "org.scalanlp" %% "breeze" % breezeVersion,

"com.microsoft.onnxruntime" % "onnxruntime" % "1.13.1",
//"org.clulab" % "roberta-onnx-model" % "0.0.2",
"org.clulab" % "deberta-onnx-model" % "0.0.3",
"org.slf4j" % "slf4j-api" % "1.7.10"
)
}

fork := true

// assembly / mainClass := Some("com.keithalcock.tokenizer.scalapy.apps.ExampleApp")

enablePlugins(ShadingPlugin)
shadedDependencies ++= Set(
"org.ejml" % "ejml-core" % "<ignored>",
"org.ejml" % "ejml-fdense" % "<ignored>",
"org.ejml" % "ejml-simple" % "<ignored>"
)
shadingRules ++= Seq(
ShadingRule.moveUnder("org.ejml", "org.clulab.shaded"),
ShadingRule.moveUnder("pabeles.concurrency", "org.clulab.shaded")
)
validNamespaces ++= Set("org", "org.clulab")
@@ -1,10 +1,10 @@
package org.clulab.scala_transformers.encoder

import breeze.linalg._
import BreezeUtils._
import breeze.linalg._
import org.clulab.scala_transformers.encoder.math.BreezeMath

object BreezeExamples extends App {
val m = mkRowMatrix[Float](Array(Array(1f, 2f), Array(3f, 4f)))
val m = BreezeMath.mkMatrixFromRows(Array(Array(1f, 2f), Array(3f, 4f)))
println(m)

println("Row 0: " + m(0, ::))
@@ -19,6 +19,6 @@ object BreezeExamples extends App {

dm(0, ::) :+= cm.t
dm(1, ::) :+= DenseVector.vertcat(m(1, ::).t, m(1, ::).t).t
println("After chaning dm:")
println("After changing dm:")
println(dm)
}
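The changelog above notes that calculation now goes through EJML rather than Breeze. For readers unfamiliar with EJML, here is a hedged, standalone sketch of the same small matrix built directly against EJML's SimpleMatrix API; it sticks to plain EJML calls and does not reproduce the project's own math abstraction, whose full interface is not visible in this diff.

```scala
import org.ejml.simple.SimpleMatrix

object EjmlExampleSketch extends App {
  // Same 2x2 values as the Breeze example above, as an EJML SimpleMatrix.
  val m = new SimpleMatrix(Array(
    Array(1.0, 2.0),
    Array(3.0, 4.0)
  ))

  println(m)
  println("Row 0: " + m.extractVector(true, 0))  // extract row 0
  println("Col 1: " + m.extractVector(false, 1)) // extract column 1
  println("Sum of entries: " + m.elementSum())
}
```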