From 40cfeb27dec304fc62d05f46737900870f24d208 Mon Sep 17 00:00:00 2001 From: Scott Morrison Date: Wed, 22 Nov 2023 13:25:06 +1100 Subject: [PATCH] feat: extracting data for doc-string training --- TrainingData/Frontend.lean | 31 ++++++++++++++++-- lakefile.lean | 5 +++ scripts/comment_data.lean | 64 ++++++++++++++++++++++++++++++++++++++ scripts/comment_data.sh | 36 +++++++++++++++++++++ 4 files changed, 134 insertions(+), 2 deletions(-) create mode 100644 scripts/comment_data.lean create mode 100755 scripts/comment_data.sh diff --git a/TrainingData/Frontend.lean b/TrainingData/Frontend.lean index c687eb5..0d4baa8 100644 --- a/TrainingData/Frontend.lean +++ b/TrainingData/Frontend.lean @@ -81,6 +81,8 @@ the `src : Substring` and `stx : Syntax` of the command, and any `Message`s and `InfoTree`s produced while processing. -/ structure CompilationStep where + fileName : String + fileMap : FileMap src : Substring stx : Syntax before : Environment @@ -104,7 +106,8 @@ def one : FrontendM (CompilationStep × Bool) := do let after := s'.env let msgs := s'.messages.msgs.drop s.messages.msgs.size let trees := s'.infoState.trees.drop s.infoState.trees.size - return ({ src, stx, before, after, msgs, trees }, done) + let ⟨_, fileName, fileMap⟩ := (← read).inputCtx + return ({ fileName, fileMap, src, stx, before, after, msgs, trees }, done) /-- Process all commands in the input. -/ partial def all : FrontendM (List CompilationStep) := do @@ -114,11 +117,35 @@ partial def all : FrontendM (List CompilationStep) := do else return cmd :: (← all) +def runCoreMBefore (c : CompilationStep) (x : CoreM α) : IO α := + (·.1) <$> Core.CoreM.toIO x { fileName := c.fileName, fileMap := c.fileMap } { env := c.before } + +open Meta in +def runMetaMBefore (c : CompilationStep) (x : MetaM α) : IO α := + c.runCoreMBefore <| MetaM.run' x {} {} + /-- Return all new `ConstantInfo`s added during the processed command. -/ def diff (cmd : CompilationStep) : List ConstantInfo := cmd.after.constants.map₂.toList.filterMap fun (c, i) => if cmd.before.constants.map₂.contains c then none else some i +/-- Data extracted from a `ConstantInfo`. -/ +structure DeclInfo where + name : Name + type : Expr + ppType : String + docString : Option String + +/-- Return info about each new declaration added during the processed command. -/ +def newDecls (cmd : CompilationStep) : IO (List DeclInfo) := do + cmd.diff.mapM fun ci => + return { + name := ci.name + type := ci.type + ppType := toString (← cmd.runMetaMBefore <| Meta.ppExpr ci.type) + docString := ← findDocString? cmd.after ci.name + } + end CompilationStep /-- @@ -188,7 +215,7 @@ open System -- TODO allow finding Lean 4 sources from the toolchain. def findLean (mod : Name) : IO FilePath := do - return FilePath.mk ((← findOLean mod).toString.replace "build/lib/" "") |>.withExtension "lean" + return FilePath.mk ((← findOLean mod).toString.replace ".lake/build/lib/" "") |>.withExtension "lean" /-- Implementation of `moduleSource`, which is the cached version of this function. -/ def moduleSource' (mod : Name) : IO String := do diff --git a/lakefile.lean b/lakefile.lean index 0de323c..dacd289 100644 --- a/lakefile.lean +++ b/lakefile.lean @@ -29,6 +29,11 @@ lean_exe training_data where root := `scripts.training_data supportInterpreter := true +@[default_target] +lean_exe comment_data where + root := `scripts.comment_data + supportInterpreter := true + @[default_target] lean_exe premises where root := `scripts.premises diff --git a/scripts/comment_data.lean b/scripts/comment_data.lean new file mode 100644 index 0000000..1083089 --- /dev/null +++ b/scripts/comment_data.lean @@ -0,0 +1,64 @@ +import TrainingData.Frontend +import TrainingData.InfoTree.ToJson +import TrainingData.InfoTree.TacticInvocation.Basic +import Mathlib.Data.String.Defs +import Mathlib.Lean.CoreM +import Std.Lean.Util.Path +import Std.Data.String.Basic +import Cli + +open Lean Elab IO Meta +open Cli System + +structure CommentData where + declName : String + declType : String + docString : Option String + context : String + decl : String +deriving ToJson, FromJson + +def addStep (data : String × List CommentData) (step : CompilationStep) : + IO (String × List CommentData) := do + let (context, prev) := data + let decl := step.src.toString + let context := context ++ decl + let new ← step.newDecls + let next := new.map fun d => + { context + docString := d.docString + declName := d.name.toString + declType := d.ppType + decl } + return (context, next ++ prev) + +def commentData (args : Cli.Parsed) : IO UInt32 := do + searchPathRef.set compile_time_search_path% + let module := args.positionalArg! "module" |>.as! ModuleName + let steps ← compileModule module + let (_, data) ← steps.foldlM (init := ("", [])) addStep + IO.println <| toJson data + return 0 + +/-- Setting up command line options and help text for `lake exe comment_data`. -/ +def comment_data : Cmd := `[Cli| + comment_data VIA commentData; ["0.0.1"] +"Export doc-string training data from the given file. + +The output is a json array of dictionaries with fields +* `declName`: the declaration name +* `declType`: the pretty-printed type of the declaration +* `docString`: the declaration doc-string, if it is present +* `decl`: the entire body of the declaration +* `context`: the file source up to before the declaration +" + + ARGS: + module : ModuleName; "Lean module to compile and export training data." +] + +/-- `lake exe comment_data` -/ +def main (args : List String) : IO UInt32 := + comment_data.validate args + +-- See `scripts/comment_data.sh` for a script to run this on all of Mathlib. diff --git a/scripts/comment_data.sh b/scripts/comment_data.sh new file mode 100755 index 0000000..1171ef7 --- /dev/null +++ b/scripts/comment_data.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +## Extracts doc-strings for every declaration. + +# See the help text in `lake exe comment_data` for a description of the output format. + +# Run either as `scripts/comment_data.sh` to run on all of Mathlib (several hours), +# or `scripts/comment_data.sh Mathlib.Logic.Hydra` to run on just one file. +# Results will go in `out/comment_data/Mathlib.Logic.Hydra.comments`. + +FLAGS=() +ARGS=() + +for arg in "$@"; do + if [[ $arg == --* ]]; then + FLAGS+=("$arg") + else + ARGS+=("$arg") + fi +done + +if [ ${#ARGS[@]} -eq 0 ]; then + lake build comment_data + parallel -j32 ./scripts/comment_data.sh ${FLAGS[@]} -- ::: `cat .lake/packages/mathlib/Mathlib.lean | sed -e 's/import //'` +else + DIR=out/comment_data + mkdir -p $DIR + mod=${ARGS[0]} + if [ ! -f $DIR/$mod.comment ]; then + echo $mod + if [ ! -f .lake/build/bin/comment_data ]; then + lake build comment_data + fi + .lake/build/bin/comment_data ${FLAGS[@]} $mod > $DIR/$mod._comment && mv $DIR/$mod._comment $DIR/$mod.comment + fi +fi