Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable running extractions on all files in a directory #304

Merged
merged 5 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ The following options are available for most functions unless stated otherwise.

Use the option `--inputfile` to specify a path to a file containing input text.

For the `extract` command, this may be a single file or a directory of files.

In the latter case, all .txt files will be assumed to be input, and the path will *not* be parsed recursively.

### template

Use the option `--template` to specify a template to use. This is a required parameter.
Expand Down
50 changes: 36 additions & 14 deletions src/ontogpt/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,14 @@ def extract(

ontogpt extract -t gocam.GoCamAnnotations -i gocam-27929086.txt

The input argument must be either a file path or a string.
Use the -i/--input-file option followed by the path to the input file if using the former.
The input argument may be:
A file path,
A directory,
or a string.
Use the -i/--input-file option followed by the path to the input file
or directory.
If the input is a directory, all files with the .txt extension will be read.
This is not recursive.
Otherwise, the input is assumed to be a string to be read as input.

You can also use fragments of existing schemas, use the --target-class option (-T) to
Expand All @@ -298,9 +304,17 @@ def extract(
model_source = selectmodel["provider"]
model_name = selectmodel["alternative_names"][0]

inputlist = []

if not inputfile or inputfile == "-":
text = sys.stdin.read()
if inputfile and Path(inputfile).exists():
inputlist.append(text)
elif inputfile and Path(inputfile).is_dir():
logging.info(f"Input file directory: {inputfile}")
inputfiles = Path(inputfile).glob("*.txt")
inputlist = [open(f, "r").read() for f in inputfiles if f.is_file()]
logging.info(f"Found {len(inputlist)} input files here.")
elif inputfile and Path(inputfile).exists():
logging.info(f"Input file: {inputfile}")
if use_textract:
import textract
Expand All @@ -309,6 +323,7 @@ def extract(
else:
text = open(inputfile, "r").read()
logging.info(f"Input text: {text}")
inputlist.append(text)
elif inputfile and not Path(inputfile).exists():
raise FileNotFoundError(f"Cannot find input file {inputfile}")

Expand All @@ -333,12 +348,20 @@ def extract(
target_class_def = ke.schemaview.get_class(target_class)
else:
target_class_def = None
results = ke.extract_from_text(text=text, cls=target_class_def, show_prompt=show_prompt)
if set_slot_value:
for slot_value in set_slot_value:
slot, value = slot_value.split("=")
setattr(results.extracted_object, slot, value)
write_extraction(results, output, output_format, ke)

i = 0
for input_entry in inputlist:
if len(inputlist) > 1:
i = i + 1
logging.info(f"Now extracting from file {i} of {len(inputlist)}")
results = ke.extract_from_text(
text=input_entry, cls=target_class_def, show_prompt=show_prompt
)
if set_slot_value:
for slot_value in set_slot_value:
slot, value = slot_value.split("=")
setattr(results.extracted_object, slot, value)
write_extraction(results, output, output_format, ke)


@main.command()
Expand Down Expand Up @@ -1464,7 +1487,7 @@ def eval_enrichment(genes, input_file, number_to_drop, annotations_path, model,
default=False,
show_default=True,
help="If set, chunk input text, then prepare a separate prompt for each chunk."
" Otherwise the full input text is passed.",
" Otherwise the full input text is passed.",
)
@click.argument("evaluator")
def eval(evaluator, num_tests, output, chunking, model, **kwargs):
Expand All @@ -1477,10 +1500,9 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs):
else:
modelname = DEFAULT_MODEL

evaluator = create_evaluator(name=evaluator,
num_tests=num_tests,
chunking=chunking,
model=modelname)
evaluator = create_evaluator(
name=evaluator, num_tests=num_tests, chunking=chunking, model=modelname
)
eos = evaluator.eval()
output.write(dump_minimal_yaml(eos, minimize=False))

Expand Down
Loading