
Commit

perf: Python performance improvements with ruff C4 and PERF fixes (#5803)

* Python performance improvements with ruff C4 and PERF

* pre-commit fixes

* Revert changes to examples/basic_qa_pipeline.py

* Revert changes to haystack/preview/testing/document_store.py

* revert releasenotes

* Upgrade to ruff v0.0.290
cclauss authored Sep 16, 2023
1 parent 1bc03dd commit 91ab90a
Showing 44 changed files with 90 additions and 90 deletions.
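
Most of the diffs below are instances of a handful of mechanical rewrites suggested by ruff's flake8-comprehensions (C4) and Perflint (PERF) rule families: drop a constructor call or comprehension that merely rebuilds what an iterable already yields, and prefer the cheaper iteration form. A condensed before/after sketch of the recurring patterns (rule codes as documented by ruff; data and variable names are illustrative):

hits = [{"id": 1}, {"id": 2}, {"id": 2}]

# C416 -- unnecessary comprehension: the constructor already does this
indices = [i for i in range(5)]        # before
indices = list(range(5))               # after

# C401 -- generator inside set(): use a set comprehension
ids = set(h["id"] for h in hits)       # before
ids = {h["id"] for h in hits}          # after

# C405 / C408 -- literals inside constructor calls
empty = set([])                        # before; after: set()
empty_list = list()                    # before; after: []

# C409 -- tuple() wrapping a literal
pair = tuple(("title", "ctx"))         # before; after: ("title", "ctx")

# PERF102 -- dict.items() when only one half of each pair is used
params = {"a": 1, "b": 2}
for key, _val in params.items():       # before
    pass
for key in params.keys():              # after, when only the keys are needed
    pass
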
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -22,9 +22,9 @@ repos:
      - id: black-jupyter

  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.289
+    rev: v0.0.290
    hooks:
-      - id: ruff
+      - id: ruff

  - repo: https://github.com/codespell-project/codespell
    rev: v2.2.5

10 changes: 5 additions & 5 deletions e2e/modeling/test_dpr.py
@@ -777,7 +777,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

    # generate embeddings with model loaded from model hub
    dataset, tensor_names, _, __ = processor.dataset_from_dicts(
-        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
+        dicts=[d], indices=list(range(len([d]))), return_baskets=True
    )

    data_loader = NamedDataLoader(
@@ -811,7 +811,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

    # generate embeddings with model loaded from disk
    dataset2, tensor_names2, _, __ = loaded_processor.dataset_from_dicts(
-        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
+        dicts=[d], indices=list(range(len([d]))), return_baskets=True
    )

    data_loader = NamedDataLoader(
@@ -820,7 +820,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
    all_embeddings2: Dict[str, Any] = {"query": [], "passages": []}
    loaded_model.eval()

-    for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
+    for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits
@@ -904,7 +904,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa

    # generate embeddings with model loaded from disk that originated from a FARM style model that was saved to disk earlier
    dataset3, tensor_names3, _, __ = loaded_processor.dataset_from_dicts(
-        dicts=[d], indices=[i for i in range(len([d]))], return_baskets=True
+        dicts=[d], indices=list(range(len([d]))), return_baskets=True
    )

    data_loader = NamedDataLoader(
@@ -913,7 +913,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa
    all_embeddings3: Dict[str, Any] = {"query": [], "passages": []}
    loaded_model.eval()

-    for i, batch in enumerate(tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True)):
+    for batch in tqdm(data_loader, desc="Creating Embeddings", unit=" Batches", disable=True):
        batch = {key: batch[key].to(device) for key in batch}

        # get logits

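The two embedding loops in this file also drop an enumerate(...) whose index i was never read; iterating the loader directly skips building and unpacking an (index, value) tuple per batch. A minimal illustration with made-up data:

batches = ["batch0", "batch1", "batch2"]

# before: for i, batch in enumerate(batches): ...   (i never used)
# after: iterate directly
for batch in batches:
    print(batch)
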
2 changes: 1 addition & 1 deletion haystack/document_stores/base.py
@@ -595,7 +595,7 @@ def _drop_duplicate_documents(self, documents: List[Document], index: Optional[s
        :param index: name of the index
        :return: A list of Haystack Document objects.
        """
-        _hash_ids: Set = set([])
+        _hash_ids: Set = set()
        _documents: List[Document] = []

        for document in documents:

6 changes: 3 additions & 3 deletions haystack/document_stores/opensearch.py
@@ -1215,11 +1215,11 @@ def _get_embedding_field_mapping(
    def _ivf_model_exists(self, index: str) -> bool:
        if self._index_exists(".opensearch-knn-models"):
            response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
-            existing_ivf_models = set(
+            existing_ivf_models = {
                model["_source"]["model_id"]
                for model in response["hits"]["hits"]
                if model["_source"]["state"] != "failed"
-            )
+            }
        else:
            existing_ivf_models = set()

@@ -1461,7 +1461,7 @@ def _delete_ivf_model(self, index: str):
        """
        if self._index_exists(".opensearch-knn-models"):
            response = self.client.transport.perform_request("GET", "/_plugins/_knn/models/_search")
-            existing_ivf_models = set(model["_source"]["model_id"] for model in response["hits"]["hits"])
+            existing_ivf_models = {model["_source"]["model_id"] for model in response["hits"]["hits"]}
            if f"{index}-ivf" in existing_ivf_models:
                self.client.transport.perform_request("DELETE", f"/_plugins/_knn/models/{index}-ivf")

2 changes: 1 addition & 1 deletion haystack/modeling/data_handler/dataset.py
@@ -27,7 +27,7 @@ def flatten_rename(
    assert any(key in encoded_batch for key in keys), f"one of the keys {keys} is not in batch {encoded_batch.keys()}"
    features_flat = []
    for item in range(len(encoded_batch[keys[0]])):
-        feat_dict = {k: v for k, v in zip(renamed_keys, [encoded_batch[k][item] for k in keys])}
+        feat_dict = dict(zip(renamed_keys, [encoded_batch[k][item] for k in keys]))
        features_flat.append(feat_dict)
    return features_flat

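The flatten_rename change is the same unnecessary-comprehension family applied to a dict: when the comprehension only repackages the (key, value) pairs that zip already produces, dict(zip(...)) builds the identical mapping in C. A quick check with illustrative data:

keys = ["input_ids", "padding_mask"]
values = [[101, 2054], [1, 1]]

assert dict(zip(keys, values)) == {k: v for k, v in zip(keys, values)}
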
14 changes: 7 additions & 7 deletions haystack/modeling/data_handler/processor.py
@@ -174,7 +174,7 @@ def load_from_dir(cls, load_dir: str):
            config = json.load(f)
        config["inference"] = True
        # init tokenizer
-        if "lower_case" in config.keys():
+        if "lower_case" in config:
            logger.warning(
                "Loading tokenizer from deprecated config. "
                "If you used `custom_vocab` or `never_split_chars`, this won't work anymore."
@@ -1249,7 +1249,7 @@ def _combine_title_context(titles: List[str], texts: List[str]):
                "Couldn't find title although `embed_title` is set to True for DPR. Using title='' now. Related passage text: '%s' ",
                ctx,
            )
-        res.append(tuple((title, ctx)))
+        res.append((title, ctx))
    return res


@@ -1762,7 +1762,7 @@ def _combine_meta_context(meta_fields: List[str], texts: List[str]):
    for meta, ctx in zip(meta_fields, texts):
        if meta is None:
            meta = ""
-        res.append(tuple((meta, ctx)))
+        res.append((meta, ctx))
    return res


@@ -2111,12 +2111,12 @@ def dataset_from_dicts(
            truncation=True,
            max_length=self.max_seq_len,
        )
-        names = [key for key in tokens]
+        names = list(tokens)
        inputs = [tokens[key] for key in tokens]
-        if not "padding_mask" in names:
+        if "padding_mask" not in names:
            index = names.index("attention_mask")
            names[index] = "padding_mask"
-        if not "segment_ids" in names:
+        if "segment_ids" not in names:
            index = names.index("token_type_ids")
            names[index] = "segment_ids"

@@ -2149,7 +2149,7 @@ def write_squad_predictions(predictions, out_filename, predictions_filename=None
            dev_labels[q["id"]] = "is_impossible"
        else:
            dev_labels[q["id"]] = q["answers"][0]["text"]
-    not_included = set(list(dev_labels.keys())) - set(list(predictions_json.keys()))
+    not_included = dev_labels.keys() - predictions_json.keys()
    if len(not_included) > 0:
        logger.info("There were missing predictions for question ids: %s", list(not_included))
    for x in not_included:

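The write_squad_predictions hunk leans on the fact that dict.keys() returns a set-like view, so set algebra works on it directly, with no set(list(...)) round-trip materializing intermediate lists. A small demonstration with illustrative data:

dev_labels = {"q1": "a", "q2": "b", "q3": "c"}
predictions_json = {"q1": "a", "q3": "x"}

# Dict views support set operations natively.
not_included = dev_labels.keys() - predictions_json.keys()
print(not_included)  # {'q2'}
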
2 changes: 1 addition & 1 deletion haystack/modeling/model/biadaptive_model.py
@@ -322,7 +322,7 @@ def forward(
                output2 = None

            embedding1, embedding2 = head(output1, output2)
-            all_logits.append(tuple([embedding1, embedding2]))
+            all_logits.append((embedding1, embedding2))
        else:
            # just return LM output (e.g. useful for extracting embeddings at inference time)
            all_logits.append((pooled_output))

8 changes: 4 additions & 4 deletions haystack/modeling/model/language_model.py
@@ -108,14 +108,14 @@ def output_dims(self):
        if self._output_dims:
            return self._output_dims

-        for odn in OUTPUT_DIM_NAMES:
-            try:
+        try:
+            for odn in OUTPUT_DIM_NAMES:
                value = getattr(self.model.config, odn, None)
                if value:
                    self._output_dims = value
                    return value
-            except AttributeError:
-                raise ModelingError("Can't get the output dimension before loading the model.")
+        except AttributeError:
+            raise ModelingError("Can't get the output dimension before loading the model.")

        raise ModelingError("Could not infer the output dimensions of the language model.")

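This is ruff's PERF203 pattern (try/except inside a loop): hoisting the try out of the loop sets up the exception context once instead of once per iteration, a measurable cost before CPython 3.11's zero-cost exceptions, and since the handler unconditionally re-raises, the observable behavior is unchanged. A runnable sketch with stand-in names:

class ModelStub:
    class config:  # stand-in for a transformers model config
        d_model = 512

model = ModelStub()
OUTPUT_DIM_NAMES = ["hidden_size", "d_model", "dim"]

# One try wraps the whole loop; a re-raising handler keeps semantics
# identical to the per-iteration try/except it replaces.
try:
    for name in OUTPUT_DIM_NAMES:
        value = getattr(model.config, name, None)  # the .config lookup can raise
        if value:
            print(name, value)  # -> d_model 512
            break
except AttributeError:
    raise RuntimeError("Can't get the output dimension before loading the model.")
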
2 changes: 1 addition & 1 deletion haystack/modeling/model/prediction_head.py
@@ -732,7 +732,7 @@ def aggregate_preds(self, preds, passage_start_t, ids, seq_2_start_t=None, label
            all_basket_labels = {k: self.reduce_labels(v) for k, v in all_basket_labels.items()}

        # Return aggregated predictions in order as a list of lists
-        keys = [k for k in all_basket_preds]
+        keys = list(all_basket_preds)
        aggregated_preds = [all_basket_preds[k] for k in keys]
        if labels:
            labels = [all_basket_labels[k] for k in keys]

2 changes: 1 addition & 1 deletion haystack/modeling/model/triadaptive_model.py
@@ -283,7 +283,7 @@ def forward(self, **kwargs):
                output2 = None

            embedding1, embedding2 = head(output1, output2)
-            all_logits.append(tuple([embedding1, embedding2]))
+            all_logits.append((embedding1, embedding2))
        else:
            # just return LM output (e.g. useful for extracting embeddings at inference time)
            all_logits.append((pooled_output))

4 changes: 2 additions & 2 deletions haystack/nodes/base.py
@@ -96,7 +96,7 @@ def name(self, value: str):
    @property
    def utilized_components(self) -> List[BaseComponent]:
        if "params" not in self._component_config:
-            return list()
+            return []
        return [param for param in self._component_config["params"].values() if isinstance(param, BaseComponent)]

    @property
@@ -229,7 +229,7 @@ def _dispatch_run_general(self, run_method: Callable, **kwargs):
                if "debug" in value.keys():
                    self.debug = value.pop("debug")

-                for _k, _v in value.items():
+                for _k in value.keys():
                    if _k not in run_signature_args:
                        raise Exception(f"Invalid parameter '{_k}' for the node '{self.name}'.")

2 changes: 1 addition & 1 deletion haystack/nodes/document_classifier/transformers.py
@@ -202,7 +202,7 @@ def predict(self, documents: List[Document], batch_size: Optional[int] = None) -
                formatted_prediction = {
                    "label": prediction["labels"][0],
                    "score": prediction["scores"][0],
-                    "details": {label: score for label, score in zip(prediction["labels"], prediction["scores"])},
+                    "details": dict(zip(prediction["labels"], prediction["scores"])),
                }
            elif self.task == "text-classification":
                formatted_prediction = {

2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/azure.py
@@ -203,7 +203,7 @@ def _convert_tables_and_text(
                if not isinstance(table.content, pd.DataFrame):
                    raise HaystackError("Document's content field must be of type 'pd.DataFrame'.")
                for _, row in table.content.iterrows():
-                    for _, cell in row.items():
+                    for cell in row.values():
                        file_text += f" {cell}"
            if not self.validate_language(file_text, valid_languages):
                logger.warning(

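One caveat with dict-style rewrites like the one above (and its twin in parsr.py below): they assume a dict-like receiver. Here row is a pandas Series, and Series.values is an ndarray property rather than a method, so row.values() raises TypeError at runtime even though row.items() works. A sketch of the distinction:

import pandas as pd

row = pd.Series({"col_a": 1, "col_b": 2})

for _, cell in row.items():  # fine: Series.items() yields (label, value) pairs
    print(cell)

print(list(row.values))      # fine: .values is a property, not a method
# row.values()               # TypeError: 'numpy.ndarray' object is not callable
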
5 changes: 2 additions & 3 deletions haystack/nodes/file_converter/base.py
@@ -202,15 +202,14 @@ def run( # type: ignore
        for file_path, file_meta in tqdm(
            zip(file_paths, meta), total=len(file_paths), disable=not self.progress_bar, desc="Converting files"
        ):
-            for doc in self.convert(
+            documents += self.convert(
                file_path=file_path,
                meta=file_meta,
                remove_numeric_tables=remove_numeric_tables,
                valid_languages=valid_languages,
                encoding=encoding,
                id_hash_keys=id_hash_keys,
-            ):
-                documents.append(doc)
+            )

        # Cleanup ligatures
        for document in documents:

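Replacing an append-per-item loop with += (equivalently list.extend) is Perflint's manual list-build pattern: one bulk extend per file instead of one method call per document. A standalone sketch with made-up names:

def convert(path):
    # stand-in for a converter that returns several documents per file
    return [f"{path}#page{i}" for i in range(3)]

documents = []
for path in ["a.pdf", "b.pdf"]:
    documents += convert(path)  # bulk extend, one call per file

# before-style equivalent, one append per document:
# for doc in convert(path):
#     documents.append(doc)
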
2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/parsr.py
@@ -199,7 +199,7 @@ def convert(
                if not isinstance(table.content, pd.DataFrame):
                    raise HaystackError("Document's content field must be of type 'pd.DataFrame'.")
                for _, row in table.content.iterrows():
-                    for _, cell in row.items():
+                    for cell in row.values():
                        file_text += f" {cell}"
            if not self.validate_language(file_text, valid_languages):
                logger.warning(

2 changes: 1 addition & 1 deletion haystack/nodes/file_converter/pdf.py
@@ -286,7 +286,7 @@ def _read_pdf(
                document += page.get_text("text", textpage=partial_tp, sort=sort_by_position) + "\f"
        else:
            cpu = cpu_count() if isinstance(multiprocessing, bool) else multiprocessing
-            page_list = [i for i in range(start_page, end_page)]
+            page_list = list(range(start_page, end_page))
            cpu = cpu if len(page_list) > cpu else len(page_list)
            parts = divide(cpu, page_list)
            pages_mp = [(i, file_path, parts, sort_by_position, ocr, ocr_language) for i in range(cpu)]

2 changes: 1 addition & 1 deletion haystack/nodes/other/join_docs.py
@@ -82,7 +82,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None):
                    "score would be `-infinity`."
                )
        else:
-            sorted_docs = [(k, v) for k, v in scores_map.items()]
+            sorted_docs = list(scores_map.items())

        if not top_k_join:
            top_k_join = self.top_k_join

2 changes: 1 addition & 1 deletion haystack/nodes/query_classifier/transformers.py
@@ -126,7 +126,7 @@ def __init__(

        self.labels = labels
        if task == "text-classification":
-            labels_from_model = [label for label in self.model.model.config.id2label.values()]
+            labels_from_model = list(self.model.model.config.id2label.values())
            if set(labels) != set(labels_from_model):
                raise ValueError(
                    f"For text-classification, the provided labels must match the model labels; only the order can differ.\n"

2 changes: 1 addition & 1 deletion haystack/nodes/ranker/base.py
@@ -55,7 +55,7 @@ def _add_meta_fields_to_docs(
            for key in embed_meta_fields:
                if key in doc.meta and doc.meta[key]:
                    if isinstance(doc.meta[key], list):
-                        meta_data_fields.extend([item for item in doc.meta[key]])
+                        meta_data_fields.extend(list(doc.meta[key]))
                    else:
                        meta_data_fields.append(doc.meta[key])
            # Convert to type string (e.g. for ints or floats)

4 changes: 2 additions & 2 deletions haystack/nodes/reader/farm.py
@@ -1153,10 +1153,10 @@ def eval(
            }

            # Get rid of the question key again (after we aggregated we don't need it anymore)
-            d[str(doc_id)]["qas"] = [v for v in aggregated_per_question.values()]
+            d[str(doc_id)]["qas"] = list(aggregated_per_question.values())

        # Convert input format for FARM
-        farm_input = [v for v in d.values()]
+        farm_input = list(d.values())
        n_queries = len([y for x in farm_input for y in x["qas"]])

        # Create DataLoader that can be passed to the Evaluator

2 changes: 1 addition & 1 deletion haystack/nodes/retriever/_embedding_encoder.py
@@ -394,7 +394,7 @@ def embed(self, model: str, text: List[str]) -> np.ndarray:
            raise CohereUnauthorizedError(f"Invalid Cohere API key. {response.text}")
        if response.status_code != 200:
            raise CohereError(response.text, status_code=response.status_code)
-        generated_embeddings = [e for e in res["embeddings"]]
+        generated_embeddings = list(res["embeddings"])
        return np.array(generated_embeddings)

    def embed_batch(self, text: List[str]) -> np.ndarray:

6 changes: 3 additions & 3 deletions haystack/nodes/retriever/dense.py
@@ -484,7 +484,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray]
        :return: dictionary of embeddings for "passages" and "query"
        """
        dataset, tensor_names, _, _ = self.processor.dataset_from_dicts(
-            dicts, indices=[i for i in range(len(dicts))], return_baskets=True
+            dicts, indices=list(range(len(dicts))), return_baskets=True
        )

        data_loader = NamedDataLoader(
@@ -1113,7 +1113,7 @@ def _get_predictions(self, dicts: List[Dict[str, Any]]) -> Dict[str, np.ndarray]
        """

        dataset, tensor_names, _, _ = self.processor.dataset_from_dicts(
-            dicts, indices=[i for i in range(len(dicts))], return_baskets=True
+            dicts, indices=list(range(len(dicts))), return_baskets=True
        )

        data_loader = NamedDataLoader(
@@ -1862,7 +1862,7 @@ def _preprocess_documents(self, docs: List[Document]) -> List[Document]:
            for key in self.embed_meta_fields:
                if key in doc.meta and doc.meta[key]:
                    if isinstance(doc.meta[key], list):
-                        meta_data_fields.extend([item for item in doc.meta[key]])
+                        meta_data_fields.extend(list(doc.meta[key]))
                    else:
                        meta_data_fields.append(doc.meta[key])
            # Convert to type string (e.g. for ints or floats)

5 changes: 1 addition & 4 deletions haystack/nodes/retriever/sparse.py
@@ -457,10 +457,7 @@ def _get_all_paragraphs(self, document_store: BaseDocumentStore, index: Optional
    def _calc_scores(self, queries: List[str], index: str) -> List[Dict[int, float]]:
        question_vector = self.vectorizer.transform(queries)
        doc_scores_per_query = self.tfidf_matrices[index].dot(question_vector.T).T.toarray()
-        doc_scores_per_query = [
-            [(doc_idx, doc_score) for doc_idx, doc_score in enumerate(doc_scores)]
-            for doc_scores in doc_scores_per_query
-        ]
+        doc_scores_per_query = [list(enumerate(doc_scores)) for doc_scores in doc_scores_per_query]
        indices_and_scores: List[Dict] = [
            OrderedDict(sorted(query_idx_scores, key=lambda tup: tup[1], reverse=True))
            for query_idx_scores in doc_scores_per_query

2 changes: 1 addition & 1 deletion haystack/nodes/translator/transformers.py
@@ -168,7 +168,7 @@ def translate(
            return translated_texts[0]
        elif documents:
            if isinstance(documents, list) and isinstance(documents[0], str):
-                return [translated_text for translated_text in translated_texts]
+                return list(translated_texts)

            translated_documents: Union[
                List[Document], List[Answer], List[str], List[Dict[str, Any]]

8 changes: 4 additions & 4 deletions haystack/pipelines/base.py
@@ -284,7 +284,7 @@ def save_to_deepset_cloud(
        index_config = index_pipeline.get_config()
        pipelines = query_config["pipelines"] + index_config["pipelines"]
        all_components = query_config["components"] + index_config["components"]
-        distinct_components = [c for c in {component["name"]: component for component in all_components}.values()]
+        distinct_components = list({component["name"]: component for component in all_components}.values())
        document_stores = [c for c in distinct_components if c["type"].endswith("DocumentStore")]
        for document_store in document_stores:
            if document_store["type"] != "DeepsetCloudDocumentStore":
@@ -827,10 +827,10 @@ def eval_beir(
            logger.info("Cropping dataset from %s to %s documents", len(corpus), num_documents)
            corpus = dict(itertools.islice(corpus.items(), num_documents))
            # Remove queries that don't contain the remaining documents
-            corpus_ids = set(list(corpus.keys()))
+            corpus_ids = set(corpus.keys())
            qrels_new = {}
            for query_id, document_rel_dict in qrels.items():
-                document_rel_ids_intersection = list(corpus_ids & set(list(document_rel_dict.keys())))
+                document_rel_ids_intersection = list(corpus_ids & set(document_rel_dict.keys()))
                # If there are no remaining documents related to the query, delete the query
                if len(document_rel_ids_intersection) == 0:
                    del queries[query_id]
@@ -1957,7 +1957,7 @@ def get_document_store(self) -> Optional[BaseDocumentStore]:
        matches = self.get_nodes_by_class(class_type=BaseDocumentStore)
        if len(matches) == 0:
            matches = list(
-                set(retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever))
+                {retriever.document_store for retriever in self.get_nodes_by_class(class_type=BaseRetriever)}
            )

        if len(matches) > 1:
