Skip to content

Commit

Permalink
Merge pull request #50 from clulab/dualsum
Browse files Browse the repository at this point in the history
Dualsum
  • Loading branch information
kwalcock authored Nov 30, 2023
2 parents 27640ae + 09218f7 commit 00f1a34
Show file tree
Hide file tree
Showing 32 changed files with 1,878 additions and 234 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
+ **0.6.2** - Shade additional package internal to EJML
+ **0.6.1** - Shade EJML
+ **0.6.0** - Calculate with EJML rather than Breeze
+ **0.6.0** - Use sum instead of concat
+ **0.5.0** - Support Linux on aarch64
+ **0.5.0** - Isolate dependencies on models to the apps subproject
+ **0.4.0** - Account for maxTokens
Expand Down
5 changes: 3 additions & 2 deletions apps/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ resolvers ++= Seq(

libraryDependencies ++= {
Seq(
"org.clulab" % "roberta-onnx-model" % "0.1.0",
"org.clulab" % "deberta-onnx-model" % "0.1.0",
"org.clulab" % "deberta-onnx-model" % "0.2.0",
"org.clulab" % "electra-onnx-model" % "0.2.0",
"org.clulab" % "roberta-onnx-model" % "0.2.0",
"org.scalatest" %% "scalatest" % "3.2.15" % "test"
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.apps
package org.clulab.scala_transformers.apps

import dev.ludovic.netlib.blas.{BLAS, JavaBLAS, NativeBLAS}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromFileApp extends App {
val baseName = args.lift(0).getOrElse("../tcmodel")
// Choose one of these.
val defaultBaseName = "../models/microsoft_deberta_v3_base_mtl/avg_export"
// val defaultBaseName = "../models/google_electra_small_discriminator_mtl/avg_export"
// val defaultBaseName = "../models/roberta_base_mtl/avg_export"

val baseName = args.lift(0).getOrElse(defaultBaseName)
val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromFiles(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@ import org.clulab.scala_transformers.tokenizer.LongTokenization
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer

object LoadExampleFromResourceApp extends App {
val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"
// Choose one of these.
val baseName = "/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export"
// val baseName = "/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export"

val tokenClassifierLayout = new TokenClassifierLayout(baseName)
val tokenClassifierFactory = new TokenClassifierFactoryFromResources(tokenClassifierLayout)
val words = Array("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@ package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.encoder.TokenClassifier

/*
import java.io.File
import org.clulab.scala_transformers.tokenizer.jni.ScalaJniTokenizer
import org.clulab.scala_transformers.tokenizer.LongTokenization
*/

object TokenClassifierExampleApp extends App {
//val tokenClassifier = TokenClassifier.fromFiles("../../scala-transformers-models/roberta-base-mtl/avg_export")
val tokenClassifier = TokenClassifier.fromFiles("../microsoft-deberta-v3-base-mtl/avg_export")
//val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// Choose one of these.
val tokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/microsoft_deberta_v3_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/google_electra_small_discriminator_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")
// val tokenClassifier = TokenClassifier.fromResources("/org/clulab/scala_transformers/models/roberta_base_mtl/avg_export")

//val words = Seq("EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", ".")
val words = Seq("John", "Doe", "went", "to", "China", ".")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package org.clulab.scala_transformers.apps

import org.clulab.scala_transformers.common.Timers
import org.clulab.scala_transformers.encoder.{EncoderMaxTokensRuntimeException, TokenClassifier}
import org.clulab.scala_transformers.tokenizer.LongTokenization

import scala.io.Source

// Benchmark app: wraps a TokenClassifier with named Timers around tokenization,
// the encoder forward pass, and each task's prediction, then runs it over a
// file of sentences and prints a timing summary via Timers.summarize().
object TokenClassifierTimerApp extends App {

// Subclass that re-uses the wrapped classifier's components (encoder, maxTokens,
// tasks, tokenizer) and overrides predictWithScores to add timing instrumentation.
class TimedTokenClassifier(tokenClassifier: TokenClassifier) extends TokenClassifier(
tokenClassifier.encoder, tokenClassifier.maxTokens, tokenClassifier.tasks, tokenClassifier.tokenizer
) {
val tokenizeTimer = Timers.getOrNew("Tokenizer")
val forwardTimer = Timers.getOrNew("Encoder.forward")
// One timer per task, labeled with the task's index and name.
val predictTimers = tokenClassifier.tasks.indices.map { index =>
val name = tasks(index).name

Timers.getOrNew(s"Encoder.predict $index\t$name")
}

// NOTE: This should be copied from the base class and then instrumented with timers.
// NOTE(review): keep this in sync with TokenClassifier.predictWithScores in the
// base class (not visible here) — any divergence would make the timings misleading.
override def predictWithScores(words: Seq[String], headTaskName: String = "Deps Head"): Array[Array[Array[(String, Float)]]] = {
// This condition must be met in order for allLabels to be filled properly without nulls.
// The condition is not checked at runtime!
// if (tasks.exists(_.dual))
// require(tasks.count(task => !task.dual && task.name == headTaskName) == 1)

// tokenize to subword tokens
val tokenization = tokenizeTimer.time {
LongTokenization(tokenizer.tokenize(words.toArray))
}
val inputIds = tokenization.tokenIds
val wordIds = tokenization.wordIds
val tokens = tokenization.tokens

// Fail fast rather than letting the encoder truncate or crash on long input.
if (inputIds.length > maxTokens) {
throw new EncoderMaxTokensRuntimeException(s"Encoder error: the following text contains more tokens than the maximum number accepted by this encoder ($maxTokens): ${tokens.mkString(", ")}")
}

// run the sentence through the transformer encoder
val encOutput = forwardTimer.time {
encoder.forward(inputIds)
}

// outputs for all tasks stored here: task x tokens in sentence x scores per token
val allLabels = new Array[Array[Array[(String, Float)]]](tasks.length)
// all heads predicted for every token
// dimensions: token x heads
var heads: Option[Array[Array[Int]]] = None

// now generate token label predictions for all primary tasks (not dual!)
for (i <- tasks.indices) {
if (!tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, None, None)
}
// Collapse subword-token labels back onto whole words.
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels

// if this is the task that predicts head positions, then save them for the dual tasks
// we save all the heads predicted for each token
if (tasks(i).name == headTaskName) {
// Labels of the head task are string-encoded integer positions; parse them.
heads = Some(tokenLabels.map(_.map(_._1.toInt)))
}
}
}

// generate outputs for the dual tasks, if heads were predicted by one of the primary tasks
// the dual task(s) must be aligned with the heads.
// that is, we predict the top label for each of the head candidates
if (heads.isDefined) {
//println("Tokens: " + tokens.mkString(", "))
//println("Heads:\n\t" + heads.get.map(_.slice(0, 3).mkString(", ")).mkString("\n\t"))
//println("Masks: " + TokenClassifier.mkTokenMask(wordIds).mkString(", "))
val masks = Some(TokenClassifier.mkTokenMask(wordIds))

for (i <- tasks.indices) {
if (tasks(i).dual) {
val tokenLabels = predictTimers(i).time {
tasks(i).predictWithScores(encOutput, heads, masks)
}
val wordLabels = TokenClassifier.mapTokenLabelsAndScoresToWords(tokenLabels, tokenization.wordIds)
allLabels(i) = wordLabels
}
}
}

allLabels
}
}

val verbose = false
// Sentence file: one sentence per line; first CLI arg overrides the default path.
val fileName = args.lift(0).getOrElse("../corpora/sentences/sentences.txt")
// Choose one of these.
val untimedTokenClassifier = TokenClassifier.fromFiles("../models/microsoft_deberta_v3_base_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/google_electra_small_discriminator_mtl/avg_export")
// val untimedTokenClassifier = TokenClassifier.fromFiles("../models/roberta_base_mtl/avg_export")

val tokenClassifier = new TimedTokenClassifier(untimedTokenClassifier)
// Read at most 100 sentences, closing the source promptly.
val lines = {
val source = Source.fromFile(fileName)
val lines = source.getLines().take(100).toArray

source.close
lines
}
val elapsedTimer = Timers.getOrNew("Elapsed")

elapsedTimer.time {
// .par was disabled; processing is sequential so per-stage timers stay meaningful.
lines.zipWithIndex/*.par*/.foreach { case (line, index) =>
println(s"$index $line")
// NOTE(review): skips a presumably problematic sentence at index 1382, but with
// take(100) above this index is unreachable — confirm whether the limit or the
// skip is the intended leftover.
if (index != 1382) {
// Sentences are pre-tokenized with single spaces.
val words = line.split(" ").toSeq
val allLabelSeqs = tokenClassifier.predictWithScores(words)

if (verbose) {
println(s"Words: ${words.mkString(", ")}")
for (layer <- allLabelSeqs) {
val words = layer.map(_.head) // Collapse the next layer by just taking the head.
val wordLabels = words.map(_._1)

println(s"Labels: ${wordLabels.mkString(", ")}")
}
}
}
}
}
// Print accumulated timing statistics for all named timers.
Timers.summarize()
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.clulab.scala_transformers.encoder.timer
package org.clulab.scala_transformers.common

import scala.collection.mutable.{HashMap => MutableHashMap}

Expand Down
21 changes: 20 additions & 1 deletion encoder/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,16 @@ libraryDependencies ++= {
case Some((2, minor)) if minor < 12 => "1.0"
case _ => "2.1.0"
}
val ejmlVersion = "0.41" // Use this older version for Java 8.

Seq(
"org.scalanlp" %% "breeze" % breezeVersion,
// Choose one of these.
/// "org.apache.commons" % "commons-math3" % "3.6.1",
"org.ejml" % "ejml-core" % ejmlVersion,
"org.ejml" % "ejml-fdense" % ejmlVersion,
"org.ejml" % "ejml-simple" % ejmlVersion,
// "org.scalanlp" %% "breeze" % breezeVersion,

"com.microsoft.onnxruntime" % "onnxruntime" % "1.13.1",
"org.slf4j" % "slf4j-api" % "1.7.10"
)
Expand All @@ -21,3 +28,15 @@ libraryDependencies ++= {
fork := true

// assembly / mainClass := Some("com.keithalcock.tokenizer.scalapy.apps.ExampleApp")

enablePlugins(ShadingPlugin)
shadedDependencies ++= Set(
"org.ejml" % "ejml-core" % "<ignored>",
"org.ejml" % "ejml-fdense" % "<ignored>",
"org.ejml" % "ejml-simple" % "<ignored>"
)
shadingRules ++= Seq(
ShadingRule.moveUnder("org.ejml", "org.clulab.shaded"),
ShadingRule.moveUnder("pabeles.concurrency", "org.clulab.shaded")
)
validNamespaces ++= Set("org", "org.clulab")
4 changes: 2 additions & 2 deletions encoder/src/main/python/averaging_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def print_some_params(self, model: TokenClassificationModel, msg: str) -> None:
ShortTaskDef("Chunking", "chunking/", "train.txt", "test.txt", "test.txt"),
#ShortTaskDef("Deps Head", "deps-wsj/", "train.heads", "dev.heads", "test.heads"),
#ShortTaskDef("Deps Label", "deps-wsj/", "train.labels", "dev.labels", "test.labels", dual_mode=True)
ShortTaskDef("Deps Head", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.heads", "dev.heads", "test.heads"),
ShortTaskDef("Deps Label", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.labels", "dev.labels", "test.labels", dual_mode=True)
ShortTaskDef("Deps Head", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.heads", "test.heads", "test.heads"),
ShortTaskDef("Deps Label", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.labels", "test.labels", "test.labels", dual_mode=True)
])
AveragingTrainer(tokenizer).train(tasks)
4 changes: 2 additions & 2 deletions encoder/src/main/python/clu_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ def compute_metrics(self, eval_pred: EvalPrediction) -> Dict[str, float]:
ShortTaskDef("NER", "conll-ner/", "train.txt", "dev.txt", "test.txt"),
ShortTaskDef("POS", "pos/", "train.txt", "dev.txt", "test.txt"),
ShortTaskDef("Chunking", "chunking/", "train.txt", "test.txt", "test.txt"), # this dataset has no dev
ShortTaskDef("Deps Head", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.heads", "dev.heads", "test.heads"),
ShortTaskDef("Deps Label", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.labels", "dev.labels", "test.labels", dual_mode=True)
ShortTaskDef("Deps Head", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.heads", "test.heads", "test.heads"), # dev is included in train
ShortTaskDef("Deps Label", "deps-combined/", "wsjtrain-wsjdev-geniatrain-geniadev.labels", "test.labels", "test.labels", dual_mode=True) # dev is included in train
#ShortTaskDef("Deps Head", "deps-wsj/", "train.heads", "dev.heads", "test.heads"),
#ShortTaskDef("Deps Label", "deps-wsj/", "train.labels", "dev.labels", "test.labels", dual_mode=True)
])
Expand Down
7 changes: 5 additions & 2 deletions encoder/src/main/python/token_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def __init__(self, hidden_size: int, num_labels: int, task_id, dual_mode: bool=F
self.dropout = nn.Dropout(dropout_p)
self.dual_mode = dual_mode
self.classifier = nn.Linear(
hidden_size if not self.dual_mode else hidden_size * 2,
hidden_size, # if not self.dual_mode else hidden_size * 2, # USE SUM
num_labels
)
self.num_labels = num_labels
Expand All @@ -248,8 +248,11 @@ def concatenate(self, sequence_output, head_positions):
head_states = sequence_output[torch.arange(sequence_output.shape[0]).unsqueeze(1), long_head_positions]
#print(f"head_states.size = {head_states.size()}")
# Concatenate the hidden states from modifier + head.
modifier_head_states = torch.cat([sequence_output, head_states], dim=2)
#modifier_head_states = torch.cat([sequence_output, head_states], dim=2)
modifier_head_states = torch.add(sequence_output, head_states) # USE SUM
#print(f"modifier_head_states.size = {modifier_head_states.size()}")
#print("EXIT")
#exit(1)
return modifier_head_states

def forward(self, sequence_output, pooled_output, head_positions, labels=None, attention_mask=None, **kwargs):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package org.clulab.scala_transformers.encoder

import breeze.linalg._
import BreezeUtils._
import breeze.linalg._
import org.clulab.scala_transformers.encoder.math.BreezeMath

object BreezeExamples extends App {
val m = mkRowMatrix[Float](Array(Array(1f, 2f), Array(3f, 4f)))
val m = BreezeMath.mkMatrixFromRows(Array(Array(1f, 2f), Array(3f, 4f)))
println(m)

println("Row 0: " + m(0, ::))
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package org.clulab.scala_transformers.encoder

import ai.onnxruntime.{OnnxTensor, OrtEnvironment, OrtSession}
import breeze.linalg.DenseMatrix
import org.clulab.scala_transformers.encoder.math.Mathematics.{Math, MathMatrix}

import java.io.DataInputStream
import java.util.{HashMap => JHashMap}
Expand All @@ -13,12 +13,12 @@ class Encoder(val encoderEnvironment: OrtEnvironment, val encoderSession: OrtSes
* @param batchInputIds First dimension is batch size (1 for a single sentence); second is sentence size
* @return Hidden states for the whole batch. The matrix dimension: rows = sentence size; columns = hidden state size
*/
def forward(batchInputIds: Array[Array[Long]]): Array[DenseMatrix[Float]] = {
def forward(batchInputIds: Array[Array[Long]]): Array[MathMatrix] = {
val inputs = new JHashMap[String, OnnxTensor]()
inputs.put("token_ids", OnnxTensor.createTensor(encoderEnvironment, batchInputIds))

val encoderOutput = encoderSession.run(inputs).get(0).getValue.asInstanceOf[Array[Array[Array[Float]]]]
val outputs = encoderOutput.map(BreezeUtils.mkRowMatrix(_))
val result: OrtSession.Result = encoderSession.run(inputs)
val outputs = Math.fromResult(result)
outputs
}

Expand All @@ -27,7 +27,7 @@ class Encoder(val encoderEnvironment: OrtEnvironment, val encoderSession: OrtSes
* @param inputIds Array of token ids for this sentence
* @return Hidden states for this sentence. The matrix dimension: rows = sentence size; columns = hidden state size
*/
def forward(inputIds: Array[Long]): DenseMatrix[Float] = {
def forward(inputIds: Array[Long]): MathMatrix = {
val batchInputIds = Array(inputIds)
forward(batchInputIds).head
}
Expand Down
Loading

0 comments on commit 00f1a34

Please sign in to comment.