Skip to content

Commit

Permalink
feat: extracting data for doc-string training
Browse files Browse the repository at this point in the history
  • Loading branch information
kim-em committed Nov 22, 2023
1 parent 44f48cf commit 40cfeb2
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 2 deletions.
31 changes: 29 additions & 2 deletions TrainingData/Frontend.lean
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ the `src : Substring` and `stx : Syntax` of the command,
and any `Message`s and `InfoTree`s produced while processing.
-/
structure CompilationStep where
fileName : String
fileMap : FileMap
src : Substring
stx : Syntax
before : Environment
Expand All @@ -104,7 +106,8 @@ def one : FrontendM (CompilationStep × Bool) := do
let after := s'.env
let msgs := s'.messages.msgs.drop s.messages.msgs.size
let trees := s'.infoState.trees.drop s.infoState.trees.size
return ({ src, stx, before, after, msgs, trees }, done)
let ⟨_, fileName, fileMap⟩ := (← read).inputCtx
return ({ fileName, fileMap, src, stx, before, after, msgs, trees }, done)

/-- Process all commands in the input. -/
partial def all : FrontendM (List CompilationStep) := do
Expand All @@ -114,11 +117,35 @@ partial def all : FrontendM (List CompilationStep) := do
else
return cmd :: (← all)

def runCoreMBefore (c : CompilationStep) (x : CoreM α) : IO α :=
(·.1) <$> Core.CoreM.toIO x { fileName := c.fileName, fileMap := c.fileMap } { env := c.before }

open Meta in
def runMetaMBefore (c : CompilationStep) (x : MetaM α) : IO α :=
c.runCoreMBefore <| MetaM.run' x {} {}

/-- Return all new `ConstantInfo`s added during the processed command. -/
def diff (cmd : CompilationStep) : List ConstantInfo :=
cmd.after.constants.map₂.toList.filterMap
fun (c, i) => if cmd.before.constants.map₂.contains c then none else some i

/-- Data extracted from a `ConstantInfo`. -/
structure DeclInfo where
name : Name
type : Expr
ppType : String
docString : Option String

/-- Return info about each new declaration added during the processed command. -/
def newDecls (cmd : CompilationStep) : IO (List DeclInfo) := do
cmd.diff.mapM fun ci =>
return {
name := ci.name
type := ci.type
ppType := toString (← cmd.runMetaMBefore <| Meta.ppExpr ci.type)
docString := ← findDocString? cmd.after ci.name
}

end CompilationStep

/--
Expand Down Expand Up @@ -188,7 +215,7 @@ open System

-- TODO allow finding Lean 4 sources from the toolchain.
def findLean (mod : Name) : IO FilePath := do
return FilePath.mk ((← findOLean mod).toString.replace "build/lib/" "") |>.withExtension "lean"
return FilePath.mk ((← findOLean mod).toString.replace ".lake/build/lib/" "") |>.withExtension "lean"

/-- Implementation of `moduleSource`, which is the cached version of this function. -/
def moduleSource' (mod : Name) : IO String := do
Expand Down
5 changes: 5 additions & 0 deletions lakefile.lean
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ lean_exe training_data where
root := `scripts.training_data
supportInterpreter := true

@[default_target]
lean_exe comment_data where
root := `scripts.comment_data
supportInterpreter := true

@[default_target]
lean_exe premises where
root := `scripts.premises
Expand Down
64 changes: 64 additions & 0 deletions scripts/comment_data.lean
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import TrainingData.Frontend
import TrainingData.InfoTree.ToJson
import TrainingData.InfoTree.TacticInvocation.Basic
import Mathlib.Data.String.Defs
import Mathlib.Lean.CoreM
import Std.Lean.Util.Path
import Std.Data.String.Basic
import Cli

open Lean Elab IO Meta
open Cli System

structure CommentData where
declName : String
declType : String
docString : Option String
context : String
decl : String
deriving ToJson, FromJson

def addStep (data : String × List CommentData) (step : CompilationStep) :
IO (String × List CommentData) := do
let (context, prev) := data
let decl := step.src.toString
let context := context ++ decl
let new ← step.newDecls
let next := new.map fun d =>
{ context
docString := d.docString
declName := d.name.toString
declType := d.ppType
decl }
return (context, next ++ prev)

def commentData (args : Cli.Parsed) : IO UInt32 := do
searchPathRef.set compile_time_search_path%
let module := args.positionalArg! "module" |>.as! ModuleName
let steps ← compileModule module
let (_, data) ← steps.foldlM (init := ("", [])) addStep
IO.println <| toJson data
return 0

/-- Setting up command line options and help text for `lake exe comment_data`. -/
def comment_data : Cmd := `[Cli|
comment_data VIA commentData; ["0.0.1"]
"Export doc-string training data from the given file.
The output is a json array of dictionaries with fields
* `declName`: the declaration name
* `declType`: the pretty-printed type of the declaration
* `docString`: the declaration doc-string, if it is present
* `decl`: the entire body of the declaration
* `context`: the file source up to before the declaration
"

ARGS:
module : ModuleName; "Lean module to compile and export training data."
]

/-- `lake exe comment_data` -/
def main (args : List String) : IO UInt32 :=
comment_data.validate args

-- See `scripts/comment_data.sh` for a script to run this on all of Mathlib.
36 changes: 36 additions & 0 deletions scripts/comment_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash

## Extracts doc-strings for every declaration.

# See the help text in `lake exe comment_data` for a description of the output format.

# Run either as `scripts/comment_data.sh` to run on all of Mathlib (several hours),
# or `scripts/comment_data.sh Mathlib.Logic.Hydra` to run on just one file.
# Results will go in `out/comment_data/Mathlib.Logic.Hydra.comments`.

FLAGS=()
ARGS=()

for arg in "$@"; do
if [[ $arg == --* ]]; then
FLAGS+=("$arg")
else
ARGS+=("$arg")
fi
done

if [ ${#ARGS[@]} -eq 0 ]; then
lake build comment_data
parallel -j32 ./scripts/comment_data.sh ${FLAGS[@]} -- ::: `cat .lake/packages/mathlib/Mathlib.lean | sed -e 's/import //'`
else
DIR=out/comment_data
mkdir -p $DIR
mod=${ARGS[0]}
if [ ! -f $DIR/$mod.comment ]; then
echo $mod
if [ ! -f .lake/build/bin/comment_data ]; then
lake build comment_data
fi
.lake/build/bin/comment_data ${FLAGS[@]} $mod > $DIR/$mod._comment && mv $DIR/$mod._comment $DIR/$mod.comment
fi
fi

0 comments on commit 40cfeb2

Please sign in to comment.