diff --git a/README.md b/README.md index e7cdce5..4edb8d7 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,9 @@ We would like to enable sharing of data and easy access to the Lean libraries fo : Export all goal / tactic pairs from the library, with additional context. `--proofstep` outputs in `[GOAL]...[PROOFSTEP]...` format. +`comment_data` +: Export all declaration / type / doc-string tuples from the library, with additional context. + `tactic_benchmark` : Run a tactic against every declaration in the library, tracking runtimes and successes. @@ -195,6 +198,21 @@ Here: There is also `scripts/training_data.sh`, which will run in parallel over all of Mathlib, recording results in `out/training_data/`. +### `comment_data` + +`lake exe comment_data Mathlib.Logic.Hydra` will output information about all doc-strings in a file. + +The output is a json array of dictionaries with fields +* `declName`: the declaration name +* `declType`: the pretty-printed type of the declaration +* `docString`: the declaration doc-string, if it is present +* `decl`: the entire body of the declaration +* `context`: the file source up to before the declaration + (this currently does not include the imports) + +There is also `scripts/comment_data.sh`, which will run in parallel over all of Mathlib, +recording results in `out/comment_data/`. + ### `tactic_benchmark` `lake exe tactic_benchmark --aesop Mathlib.Topology.ContinuousFunction.Ordered` diff --git a/TrainingData/Frontend.lean b/TrainingData/Frontend.lean index 0d4baa8..db364a8 100644 --- a/TrainingData/Frontend.lean +++ b/TrainingData/Frontend.lean @@ -38,7 +38,7 @@ The functions `compileModule : Name → IO (List CompilationStep)` and set_option autoImplicit true -open Lean Elab Frontend +open Lean Elab Frontend Meta namespace Lean.PersistentArray @@ -71,6 +71,22 @@ partial def runStateRefT [Monad m] [MonadLiftT (ST ω) m] (L : MLList (StateRefT end MLList +private def isInternal' (declName : Name) : Bool := + declName.isInternal || + match declName with + | .str _ s => "match_".isPrefixOf s || "proof_".isPrefixOf s + | _ => true + +-- from Lean.Server.Completion +private def isBlackListed {m} [Monad m] [MonadEnv m] (declName : Name) : m Bool := do + if declName == ``sorryAx then return true + if declName matches .str _ "inj" then return true + if declName matches .str _ "noConfusionType" then return true + let env ← getEnv + pure $ isInternal' declName + || isAuxRecursor env declName + || isNoConfusion env declName + <||> isRec declName <||> isMatcher declName namespace Lean.Elab.IO /-- @@ -138,11 +154,13 @@ structure DeclInfo where /-- Return info about each new declaration added during the processed command. -/ def newDecls (cmd : CompilationStep) : IO (List DeclInfo) := do - cmd.diff.mapM fun ci => - return { + cmd.diff.filterMapM fun ci => cmd.runMetaMBefore do + if ← isBlackListed ci.name then + pure none + else pure <| some { name := ci.name type := ci.type - ppType := toString (← cmd.runMetaMBefore <| Meta.ppExpr ci.type) + ppType := toString (← Meta.ppExpr ci.type) docString := ← findDocString? cmd.after ci.name } diff --git a/make.sh b/make.sh index 78330d3..8c0b932 100755 --- a/make.sh +++ b/make.sh @@ -9,6 +9,8 @@ DATE=`date "+%Y-%m-%d"` mkdir -p out +lake build + lake exe declaration_types > out/$DATE-declaration_types gzip -f out/$DATE-declaration_types @@ -37,5 +39,11 @@ cd out tar cvzf $DATE-proof_step.tgz training_data cd .. +rm -rf out/comment_data +./scripts/comment_data.sh +cd out +tar cvzf $DATE-comment_data.tgz comment_data +cd .. + rm -rf out/tactic_benchmark ./scripts/tactic_benchmark.sh --aesop diff --git a/scripts/comment_data.lean b/scripts/comment_data.lean index 1083089..f5b4d52 100644 --- a/scripts/comment_data.lean +++ b/scripts/comment_data.lean @@ -51,6 +51,7 @@ The output is a json array of dictionaries with fields * `docString`: the declaration doc-string, if it is present * `decl`: the entire body of the declaration * `context`: the file source up to before the declaration + (this currently does not include the imports) " ARGS: