Merge pull request #51 from clulab/kwalcock/mathWithDualSum
Factor out the math
kwalcock authored Nov 30, 2023
2 parents 32e07af + 70f44f2 commit 09218f7
Showing 41 changed files with 2,158 additions and 465 deletions.
10 changes: 8 additions & 2 deletions CHANGES.md
@@ -1,4 +1,10 @@
+ **0.6.2** - Shade additional package internal to EJML
+ **0.6.1** - Shade EJML
+ **0.6.0** - Calculate with EJML rather than Breeze
+ **0.6.0** - Use sum instead of concat
+ **0.5.0** - Support Linux on aarch64
+ **0.5.0** - Isolate dependencies on models to the apps subproject
+ **0.4.0** - Account for maxTokens
+ **0.3.0** - Include some tokenizers as resources and only fall back to network if necessary
+ **0.2.0** - Faster version using JNI
+ **0.1.0** - Initial version using the J4rs library
4 changes: 2 additions & 2 deletions README.md
@@ -13,7 +13,7 @@ The libraries and models resulting from this project are incorporated into [proc
To incorporate the encoder subproject as a Scala library dependency, either to access an existing model or because you've trained a new one with the Python code there, you'll need to add something like this to your `build.sbt` file:

```scala
libraryDependencies += "org.clulab" %% "scala-transformers-encoder" % "0.3.0"
libraryDependencies += "org.clulab" %% "scala-transformers-encoder" % "0.4.0"
```

New models should generally be published to the [CLU Lab's artifactory server](https://artifactory.clulab.org/) so that they can be treated as library dependencies, although they can also be accessed as local files. Two models have been generated and published. They are incorporated into a Scala project with
@@ -36,7 +36,7 @@ Please see the [encoder README](encoder/README.md) for information about how to
To use the tokenizer subproject as a Scala library dependency, you'll need to add something like this to your `build.sbt` file:

```scala
libraryDependencies += "org.clulab" %% "scala-transformers-tokenizer" % "0.3.0"
libraryDependencies += "org.clulab" %% "scala-transformers-tokenizer" % "0.4.0"
```

See the [tokenizer README](tokenizer/README.md) for information about which tokenizers have already been packaged and how they are accessed.
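For reference, the published models and the resolver that serves them both appear in the new apps/build.sbt later in this commit. A minimal build.sbt sketch that combines them with the encoder dependency above, with coordinates copied from this diff (adjust versions to whatever is current for your project):

```scala
// Sketch only: the resolver and artifact coordinates are copied from apps/build.sbt
// and README.md as they appear in this commit; versions may have moved on.
resolvers += "Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"

libraryDependencies ++= Seq(
  "org.clulab" %% "scala-transformers-encoder" % "0.4.0", // the encoder library
  "org.clulab"  % "deberta-onnx-model"          % "0.2.0"  // one of the published ONNX models
)
```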
19 changes: 19 additions & 0 deletions apps/build.sbt
@@ -0,0 +1,19 @@
name := "scala-transformers-apps"
description := "Houses apps, particularly those having extra library dependencies"

resolvers ++= Seq(
"Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
)

libraryDependencies ++= {
Seq(
"org.clulab" % "deberta-onnx-model" % "0.2.0",
"org.clulab" % "electra-onnx-model" % "0.2.0",
"org.clulab" % "roberta-onnx-model" % "0.2.0",
"org.scalatest" %% "scalatest" % "3.2.15" % "test"
)
}

fork := true

// assembly / mainClass := Some("com.keithalcock.tokenizer.scalapy.apps.ExampleApp")
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import dev.ludovic.netlib.blas.{BLAS, JavaBLAS, NativeBLAS}

@@ -1,11 +1,16 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.{Encoder, TokenClassifierFactoryFromFiles, TokenClassifierLayout}
import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromFileApp extends App {
val baseName = args.lift(0).getOrElse("../tcmodel")
// Choose one of these.
val defaultBaseName = "../models/microsoft_deberta_v3_base_mtl/avg_export"
// val defaultBaseName = "../models/google_electra_small_discriminator_mtl/avg_export"
// val defaultBaseName = "../models/roberta_base_mtl/avg_export"

val baseName = args.lift(0).getOrElse(defaultBaseName)
val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromFiles(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
@@ -1,11 +1,15 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.{Encoder, TokenClassifierFactoryFromResources, TokenClassifierLayout}
import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromResourceApp extends App {
val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"
// Choose one of these.
val baseName = "/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"

val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromResources(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
@@ -1,18 +1,15 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.TokenClassifier

/*
import java.io.File
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer
import org.clulab.scala_transformers.tokenizer.LongTokenization
*/

object TokenClassifierExampleApp extends App {
val tokenClassifier = TokenClassifier.fromFiles("/home/msurdeanu/github/scala-transformers/roberta-base-mtl/avg_export")
//val tokenClassifier = TokenClassifier.fromFiles("../microsoft-deberta-v3-base-mtl/avg_export")
//val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// Choose one of these.
val tokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export")

//val words = Seq("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
val words = Seq("John", "Doe", "went", "to", "China", ".")
@@ -0,0 +1,130 @@
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.common.Timers
import org.clulab.scala_transformers.encoder.{EncoderMaxTokensRuntimeException, TokenClassifier}
import org.clulab.scala_transformers.tokenizer.LongTokenization

import scala.io.Source

object TokenClassifierTimerApp extends App {

class TimedTokenClassifier(tokenClassifier: TokenClassifier) extends TokenClassifier(
tokenClassifier.encoder, tokenClassifier.maxTokens, tokenClassifier.tasks, tokenClassifier.tokenizer
) {
val tokenizeTimer = Timers.getOrNew("Tokenizer")
val forwardTimer = Timers.getOrNew("Encoder.forward")
val predictTimers = tokenClassifier.tasks.indices.map { index =>
val name = tasks(index).name

Timers.getOrNew(s"Encoder.predict $index\t$name")
}

// NOTE: This should be copied from the base class and then instrumented with timers.
override def predictWithScores(words: Seq[String], headTaskName: String = "Deps Head"): Array[Array[Array[(String, Float)]]] = {
// This condition must be met in order for allLabels to be filled properly without nulls.
// The condition is not checked at runtime!
// if (tasks.exists(_.dual))
// require(tasks.count(task => !task.dual && task.name == headTaskName) == 1)

// tokenize to subword tokens
val tokenization = tokenizeTimer.time {
LongTokenization(tokenizer.tokenize(words.toArray))
}
val inputIds = tokenization.tokenIds
val wordIds = tokenization.wordIds
val tokens = tokenization.tokens

if (inputIds.length > maxTokens) {
throw new EncoderMaxTokensRuntimeException(s"Encoder error: the following text contains more tokens than the maximum number accepted by this encoder ($maxTokens): ${tokens.mkString(", ")}")
}

// run the sentence through the transformer encoder
val encOutput = forwardTimer.time {
encoder.forward(inputIds)
}

// outputs for all tasks stored here: task x tokens in sentence x scores per token
val allLabels = new Array[Array[Array[(String, Float)]]](tasks.length)
// all heads predicted for every token
// dimensions: token x heads
var heads: Option[Array[Array[Int]]] = None

// now generate token label predictions for all primary tasks (not dual!)
for (i <- tasks.indices) {
if (!tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, None, None)
}
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels

// if this is the task that predicts head positions, then save them for the dual tasks
// we save all the heads predicted for each token
if (tasks(i).name == headTaskName) {
heads = Some(tokenLabels.map(_.map(_._1.toInt)))
}
}
}

// generate outputs for the dual tasks, if heads were predicted by one of the primary tasks
// the dual task(s) must be aligned with the heads.
// that is, we predict the top label for each of the head candidates
if (heads.isDefined) {
//println("Tokens: " + tokens.mkString(", "))
//println("Heads:\n\t" + heads.get.map(_.slice(0, 3).mkString(", ")).mkString("\n\t"))
//println("Masks: " + TokenClassifier.mkTokenMask(wordIds).mkString(", "))
val masks = Some(TokenClassifier.mkTokenMask(wordIds))

for (i <- tasks.indices) {
if (tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, heads, masks)
}
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels
}
}
}

allLabels
}
}

val verbose = false
val fileName = args.lift(0).getOrElse("../corpora/sentences/sentences.txt")
// Choose one of these.
val untimedTokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")

val tokenClassifier = new TimedTokenClassifier(untimedTokenClassifier)
val lines = {
val source = Source.fromFile(fileName)
val lines = source.getLines().take(100).toArray

source.close
lines
}
val elapsedTimer = Timers.getOrNew("Elapsed")

elapsedTimer.time {
lines.zipWithIndex/*.par*/.foreach { case (line, index) =>
println(s"$index $line")
if (index != 1382) {
val words = line.split(" ").toSeq
val allLabelSeqs = tokenClassifier.predictWithScores(words)

if (verbose) {
println(s"Words: ${words.mkString(", ")}")
for (layer <- allLabelSeqs) {
val words = layer.map(_.head) // Collapse the next layer by just taking the head.
val wordLabels = words.map(_._1)

println(s"Labels: ${wordLabels.mkString(", ")}")
}
}
}
}
}
Timers.summarize()
}
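The Timers helper is imported above from org.clulab.scala_transformers.common. A minimal, self-contained sketch of the three calls this app relies on (Timers.getOrNew, timer.time, and Timers.summarize, all taken from this diff), with arbitrary work standing in for tokenization and prediction:

```scala
import org.clulab.scala_transformers.common.Timers

object TimersUsageSketch extends App {
  // The timed block's value is returned by time, matching the usage above.
  val timer = Timers.getOrNew("sqrt-sum")
  val result = timer.time {
    (1 to 1000000).map(i => math.sqrt(i.toDouble)).sum // arbitrary stand-in work
  }

  println(result)
  Timers.summarize() // report the registered timers
}
```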
12 changes: 9 additions & 3 deletions build.sbt
@@ -1,10 +1,10 @@
val scala211 = "2.11.12" // up to 2.11.12
val scala212 = "2.12.18" // up to 2.12.18
val scala213 = "2.13.11" // up to 2.13.11
val scala213 = "2.13.12" // up to 2.13.12
val scala30 = "3.0.2" // up to 3.0.2
val scala31 = "3.1.3" // up to 3.1.3
val scala32 = "3.2.2" // up to 3.2.2
val scala33 = "3.3.0" // up to 3.3.0
val scala33 = "3.3.1" // up to 3.3.1
val scala3 = scala31

// Breeze 1.1+ is not available for scala211.
@@ -14,7 +14,13 @@ ThisBuild / scalaVersion := scala212
name := "scala-transformers"

lazy val root = (project in file("."))
.aggregate(common, tokenizer, encoder)
.aggregate(apps, common, tokenizer, encoder)
.settings(
publish / skip := true
)

lazy val apps = project
.dependsOn(encoder)
.settings(
publish / skip := true
)
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.tokenizer
package org.clulab.scala_transformers.common

import java.io.{FileInputStream, FileOutputStream}

@@ -33,6 +33,8 @@ object LibraryLoader {
else text

def isMac: Boolean = System.getProperty("os.name").startsWith("Mac ")

def isWin: Boolean = System.getProperty("os.name").startsWith("Win")

def isApple: Boolean = System.getProperty("os.arch") == "aarch64"

@@ -41,7 +43,7 @@ object LibraryLoader {
// The Mac reports a libname ending with .dylib, but Java needs .jnilib instead.
val jniName = replaceSuffix(libName, ".dylib", ".jnilib");
val resourceName =
if (!isMac) jniName
if (isWin) jniName
else if (isApple) "apple-" + jniName
else "intel-" + jniName

@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.timer
package org.clulab.scala_transformers.common

import scala.collection.mutable.{HashMap => MutableHashMap}

25 changes: 21 additions & 4 deletions encoder/build.sbt
@@ -2,24 +2,41 @@ name := "scala-transformers-encoder"
description := "Provides a Scala interface to huggingface encoders"

resolvers ++= Seq(
"Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
// "Artifactory" at "https://artifactory.clulab.org/artifactory/sbt-release"
)

libraryDependencies ++= {
val breezeVersion = CrossVersion.partialVersion(scalaVersion.value) match {
case Some((2, minor)) if minor < 12 => "1.0"
case _ => "2.1.0"
}
val ejmlVersion = "0.41" // Use this older version for Java 8.

Seq(
"org.scalanlp" %% "breeze" % breezeVersion,
// Choose one of these.
/// "org.apache.commons" % "commons-math3" % "3.6.1",
"org.ejml" % "ejml-core" % ejmlVersion,
"org.ejml" % "ejml-fdense" % ejmlVersion,
"org.ejml" % "ejml-simple" % ejmlVersion,
// "org.scalanlp" %% "breeze" % breezeVersion,

"com.microsoft.onnxruntime" % "onnxruntime" % "1.13.1",
//"org.clulab" % "roberta-onnx-model" % "0.0.2",
"org.clulab" % "deberta-onnx-model" % "0.0.3",
"org.slf4j" % "slf4j-api" % "1.7.10"
)
}

fork := true

// assembly / mainClass := Some("com.keithalcock.tokenizer.scalapy.apps.ExampleApp")

enablePlugins(ShadingPlugin)
shadedDependencies ++= Set(
"org.ejml" % "ejml-core" % "<ignored>",
"org.ejml" % "ejml-fdense" % "<ignored>",
"org.ejml" % "ejml-simple" % "<ignored>"
)
shadingRules ++= Seq(
ShadingRule.moveUnder("org.ejml", "org.clulab.shaded"),
ShadingRule.moveUnder("pabeles.concurrency", "org.clulab.shaded")
)
validNamespaces ++= Set("org", "org.clulab")
@@ -1,10 +1,10 @@
package org.clulab.scala_transformers.encoder

import breeze.linalg._
import BreezeUtils._
import breeze.linalg._
import org.clulab.scala_transformers.encoder.math.BreezeMath

object BreezeExamples extends App {
val m = mkRowMatrix[Float](Array(Array(1f, 2f), Array(3f, 4f)))
val m = BreezeMath.mkMatrixFromRows(Array(Array(1f, 2f), Array(3f, 4f)))
println(m)

println("Row 0: " + m(0, ::))
@@ -19,6 +19,6 @@ object BreezeExamples extends App {

dm(0, ::) :+= cm.t
dm(1, ::) :+= DenseVector.vertcat(m(1, ::).t, m(1, ::).t).t
println("After chaning dm:")
println("After changing dm:")
println(dm)
}
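The changelog above notes that calculation now goes through EJML rather than Breeze. For readers unfamiliar with EJML, here is a hedged, standalone sketch of the same small matrix built directly against EJML's SimpleMatrix API; it sticks to plain EJML calls and does not reproduce the project's own math abstraction, whose full interface is not visible in this diff.

```scala
import org.ejml.simple.SimpleMatrix

object EjmlExampleSketch extends App {
  // Same 2x2 values as the Breeze example above, as an EJML SimpleMatrix.
  val m = new SimpleMatrix(Array(
    Array(1.0, 2.0),
    Array(3.0, 4.0)
  ))

  println(m)
  println("Row 0: " + m.extractVector(true, 0))  // extract row 0
  println("Col 1: " + m.extractVector(false, 1)) // extract column 1
  println("Sum of entries: " + m.elementSum())
}
```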