From 88727986ffa91662593958023be8ac3ccef2cab0 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Sun, 7 Apr 2024 21:10:55 +0300 Subject: [PATCH 1/8] [onnx utils] update onnx utils packages --- onnx_utils/function.yaml | 42 +++++++++++++++++------------------ onnx_utils/item.yaml | 16 ++++++------- onnx_utils/requirements.txt | 12 +++++----- onnx_utils/test_onnx_utils.py | 4 ++-- 4 files changed, 36 insertions(+), 38 deletions(-) diff --git a/onnx_utils/function.yaml b/onnx_utils/function.yaml index 7a0054c4d..88f810fb4 100644 --- a/onnx_utils/function.yaml +++ b/onnx_utils/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: onnx-utils tag: '' - hash: 0c4a6491b976d5220d3ebfb83a3905dd47e86be2 + hash: fd6cd909ef6e055c348b44a0313e190513cd755b project: '' labels: author: guyl @@ -16,16 +16,16 @@ spec: functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any, Callable, Dict, List, Tuple

import mlrun


class _ToONNXConversions:
    """
    An ONNX conversion functions library class.

    Every static method receives an initialized MLRun model handler of a specific framework, parses the generic
    arguments into the objects that framework expects and calls the handler's `to_onnx` method.
    """

    @staticmethod
    def tf_keras_to_onnx(
        model_handler,
        onnx_model_name: str = None,
        optimize_model: bool = True,
        input_signature: List[Tuple[Tuple[int, ...], str]] = None,
    ):
        """
        Convert a TF.Keras model to an ONNX model and log it back to MLRun as a new model object.

        :param model_handler:   An initialized TFKerasModelHandler with a loaded model to convert to ONNX.
        :param onnx_model_name: The name to use to log the converted ONNX model. If not given, the given `model_name`
                                will be used with an additional suffix `_onnx`. Defaulted to None.
        :param optimize_model:  Whether or not to optimize the ONNX model using 'onnxoptimizer' before saving the model.
                                Defaulted to True.
        :param input_signature: A list of the input layers shape and data type properties. Expected to receive a list
                                where each element is an input layer tuple. An input layer tuple is a tuple of:
                                [0] = Layer's shape, a tuple of integers.
                                [1] = Layer's data type, a mlrun.data_types.ValueType string.
                                If None, the input signature will be tried to be read from the model artifact. Defaulted
                                to None.

        :raise MLRunRuntimeError: If 'input_signature' was not provided and reading the inputs from the model failed.
        """
        # Import the framework and handler (imported lazily so only the framework in use must be installed):
        import tensorflow as tf
        from mlrun.frameworks.tf_keras import TFKerasUtils

        # Check the given 'input_signature' parameter:
        if input_signature is None:
            # Read the inputs from the model:
            try:
                model_handler.read_inputs_from_model()
            except Exception as error:
                # Chain the original error so its traceback is not lost:
                raise mlrun.errors.MLRunRuntimeError(
                    f"Please provide the 'input_signature' parameter. The function tried reading the input layers "
                    f"information automatically but failed with the following error: {error}"
                ) from error
        else:
            # Parse the 'input_signature' parameter into tensor specs:
            input_signature = [
                tf.TensorSpec(
                    shape=shape,
                    dtype=TFKerasUtils.convert_value_type_to_tf_dtype(
                        value_type=value_type
                    ),
                )
                for (shape, value_type) in input_signature
            ]

        # Convert to ONNX:
        model_handler.to_onnx(
            model_name=onnx_model_name,
            input_signature=input_signature,
            optimize=optimize_model,
        )

    @staticmethod
    def pytorch_to_onnx(
        model_handler,
        onnx_model_name: str = None,
        optimize_model: bool = True,
        input_signature: List[Tuple[Tuple[int, ...], str]] = None,
        input_layers_names: List[str] = None,
        output_layers_names: List[str] = None,
        dynamic_axes: Dict[str, Dict[int, str]] = None,
        is_batched: bool = True,
    ):
        """
        Convert a PyTorch model to an ONNX model and log it back to MLRun as a new model object.

        :param model_handler:       An initialized PyTorchModelHandler with a loaded model to convert to ONNX.
        :param onnx_model_name:     The name to use to log the converted ONNX model. If not given, the given
                                    `model_name` will be used with an additional suffix `_onnx`. Defaulted to None.
        :param optimize_model:      Whether or not to optimize the ONNX model using 'onnxoptimizer' before saving the
                                    model. Defaulted to True.
        :param input_signature:     A list of the input layers shape and data type properties. Expected to receive a
                                    list where each element is an input layer tuple. An input layer tuple is a tuple of:
                                    [0] = Layer's shape, a tuple of integers.
                                    [1] = Layer's data type, a mlrun.data_types.ValueType string.
                                    If None, the input signature will be tried to be read from the model artifact.
                                    Defaulted to None.
        :param input_layers_names:  List of names to assign to the input nodes of the graph in order. All of the other
                                    parameters (inner layers) can be set as well by passing additional names in the
                                    list. The order is by the order of the parameters in the model. If None, the inputs
                                    will be read from the handler's inputs. If its also None, it is defaulted to:
                                    "input_0", "input_1", ...
        :param output_layers_names: List of names to assign to the output nodes of the graph in order. If None, the
                                    outputs will be read from the handler's outputs. If its also None, it is defaulted
                                    to: "output_0" (for multiple outputs, this parameter must be provided).
        :param dynamic_axes:        If part of the input / output shape is dynamic, like (batch_size, 3, 32, 32) you can
                                    specify it by giving a dynamic axis to the input / output layer by its name as
                                    follows: {
                                        "input layer name": {0: "batch_size"},
                                        "output layer name": {0: "batch_size"},
                                    }
                                    If provided, the 'is_batched' flag will be ignored. Defaulted to None.
        :param is_batched:          Whether to include a batch size as the first axis in every input and output layer.
                                    Defaulted to True. Will be ignored if 'dynamic_axes' is provided.
        """
        # Import the framework and handler (imported lazily so only the framework in use must be installed):
        import torch
        from mlrun.frameworks.pytorch import PyTorchUtils

        # Parse the 'input_signature' parameter into a tuple of zero tensors, passed to the handler as the input sample:
        if input_signature is not None:
            input_signature = tuple(
                torch.zeros(
                    size=shape,
                    dtype=PyTorchUtils.convert_value_type_to_torch_dtype(
                        value_type=value_type
                    ),
                )
                for (shape, value_type) in input_signature
            )

        # Convert to ONNX:
        model_handler.to_onnx(
            model_name=onnx_model_name,
            input_sample=input_signature,
            optimize=optimize_model,
            input_layers_names=input_layers_names,
            output_layers_names=output_layers_names,
            dynamic_axes=dynamic_axes,
            is_batched=is_batched,
        )


# Lookup table from a model handler's framework name to the function that converts that framework's model to ONNX:
_CONVERSION_MAP: Dict[str, Callable] = {
    "tensorflow.keras": _ToONNXConversions.tf_keras_to_onnx,
    "torch": _ToONNXConversions.pytorch_to_onnx,
}


def to_onnx(
    context: mlrun.MLClientCtx,
    model_path: str,
    onnx_model_name: str = None,
    optimize_model: bool = True,
    framework_kwargs: Dict[str, Any] = None,
):
    """
    Convert the given model to an ONNX model.

    The model is loaded with `AutoMLRun.load_model` and the conversion function is picked from `_CONVERSION_MAP`
    according to the loaded handler's `FRAMEWORK_NAME`.

    :param context:          The MLRun function execution context
    :param model_path:       The model path store object.
    :param onnx_model_name:  The name to use to log the converted ONNX model. If not given, the given `model_name` will
                             be used with an additional suffix `_onnx`. Defaulted to None.
    :param optimize_model:   Whether to optimize the ONNX model using 'onnxoptimizer' before saving the model. Defaulted
                             to True.
    :param framework_kwargs: Additional arguments each framework may require in order to convert to ONNX. To get the doc
                             string of the desired framework onnx conversion function, pass "help".

    :raise MLRunInvalidArgumentError: If the model's framework has no ONNX conversion or if a TypeError was raised
                                      during the conversion (for example, an unexpected keyword in
                                      'framework_kwargs').
    """
    from mlrun.frameworks.auto_mlrun.auto_mlrun import AutoMLRun

    # Get a model handler of the required framework:
    model_handler = AutoMLRun.load_model(model_path=model_path, context=context)

    # Get the model's framework:
    framework = model_handler.FRAMEWORK_NAME

    # Use the conversion map to get the specific framework to onnx conversion:
    if framework not in _CONVERSION_MAP:
        raise mlrun.errors.MLRunInvalidArgumentError(
            f"The following framework: '{framework}', has no ONNX conversion."
        )
    conversion_function = _CONVERSION_MAP[framework]

    # Check if needed to print the function's doc string ("help" is passed):
    if framework_kwargs == "help":
        print(conversion_function.__doc__)
        return

    # Set the default empty framework kwargs if needed:
    if framework_kwargs is None:
        framework_kwargs = {}

    # Run the conversion:
    try:
        conversion_function(
            model_handler=model_handler,
            onnx_model_name=onnx_model_name,
            optimize_model=optimize_model,
            **framework_kwargs,
        )
    except TypeError as exception:
        # Chain the original exception so the traceback of the actual failure is kept:
        raise mlrun.errors.MLRunInvalidArgumentError(
            f"ERROR: A TypeError exception was raised during the conversion:\n{exception}. "
            f"Please read the {framework} framework conversion function doc string by passing 'help' in the "
            f"'framework_kwargs' dictionary parameter."
        ) from exception


def optimize(
    context: mlrun.MLClientCtx,
    model_path: str,
    optimizations: List[str] = None,
    fixed_point: bool = False,
    optimized_model_name: str = None,
):
    """
    Optimize the given ONNX model and log the result.

    :param context:              The MLRun function execution context.
    :param model_path:           Path to the ONNX model object.
    :param optimizations:        List of possible optimizations. Pass "help" to only print the available optimizations
                                 and return. If None, all of the optimizations will be used. Defaulted to None.
    :param fixed_point:          Optimize the weights using fixed point. Defaulted to False.
    :param optimized_model_name: The name of the optimized model. If None, the original model will be overridden.
                                 Defaulted to None.
    """
    # Lazy imports of the optimizer and the ONNX handler:
    import onnxoptimizer
    from mlrun.frameworks.onnx import ONNXModelHandler

    # "help" mode - list the passes 'onnxoptimizer' offers and stop:
    if optimizations == "help":
        passes_listing = "\n* ".join(onnxoptimizer.get_available_passes())
        print(f"The available optimizations are:\n* {passes_listing}")
        return

    # Load the ONNX model through its handler and optimize it with the given configurations:
    onnx_handler = ONNXModelHandler(model_path=model_path, context=context)
    onnx_handler.load()
    onnx_handler.optimize(optimizations=optimizations, fixed_point=fixed_point)

    # A new name was requested for the optimized model:
    if optimized_model_name is not None:
        onnx_handler.set_model_name(model_name=optimized_model_name)

    # Log the optimized model:
    onnx_handler.log()
 base_image: mlrun/mlrun commands: [] - code_origin: https://github.com/yonishelach/functions.git#f84b9565a33d8159315992ebba5838d41f6cc112:/Users/Yonatan_Shelach/projects/functions/onnx_utils/onnx_utils.py - origin_filename: /Users/Yonatan_Shelach/projects/functions/onnx_utils/onnx_utils.py + code_origin: '' + origin_filename: '' with_mlrun: false auto_build: true requirements: - - onnx~=1.13.0 - - onnxruntime~=1.14.0 - - onnxoptimizer~=0.3.0 - - onnxmltools~=1.11.0 - - tf2onnx~=1.13.0 + - onnx~=1.15.0 + - onnxruntime~=1.8.1 + - onnxoptimizer~=0.2.0 + - onnxmltools~=1.9.0 + - tf2onnx~=1.16.0 entry_points: tf_keras_to_onnx: name: tf_keras_to_onnx @@ -35,7 +35,6 @@ spec: - name: model_handler doc: An initialized TFKerasModelHandler with a loaded model to convert to ONNX. - default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -55,9 +54,10 @@ spec: data type, a mlrun.data_types.ValueType string. If None, the input signature will be tried to be read from the model artifact. Defaulted to None.' default: null - outputs: - - default: '' + outputs: [] lineno: 26 + has_varargs: false + has_kwargs: false pytorch_to_onnx: name: pytorch_to_onnx doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a @@ -66,7 +66,6 @@ spec: - name: model_handler doc: An initialized PyTorchModelHandler with a loaded model to convert to ONNX. - default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -114,9 +113,10 @@ spec: doc: Whether to include a batch size as the first axis in every input and output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided. default: true - outputs: - - default: '' + outputs: [] lineno: 81 + has_varargs: false + has_kwargs: false to_onnx: name: to_onnx doc: Convert the given model to an ONNX model. 
@@ -124,11 +124,9 @@ spec: - name: context type: MLClientCtx doc: The MLRun function execution context - default: '' - name: model_path type: str doc: The model path store object. - default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -146,9 +144,10 @@ spec: ONNX. To get the doc string of the desired framework onnx conversion function, pass "help". default: null - outputs: - - default: '' + outputs: [] lineno: 160 + has_varargs: false + has_kwargs: false optimize: name: optimize doc: Optimize the given ONNX model. @@ -156,11 +155,9 @@ spec: - name: context type: MLClientCtx doc: The MLRun function execution context. - default: '' - name: model_path type: str doc: Path to the ONNX model object. - default: '' - name: optimizations type: List[str] doc: List of possible optimizations. To see what optimizations are available, @@ -176,9 +173,10 @@ spec: doc: The name of the optimized model. If None, the original model will be overridden. Defaulted to None. default: null - outputs: - - default: '' + outputs: [] lineno: 219 + has_varargs: false + has_kwargs: false description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. 
default_handler: to_onnx diff --git a/onnx_utils/item.yaml b/onnx_utils/item.yaml index 36335837a..84486d9f8 100644 --- a/onnx_utils/item.yaml +++ b/onnx_utils/item.yaml @@ -12,9 +12,9 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.1.0 +mlrunVersion: 1.6.3 name: onnx_utils -platformVersion: 3.5.0 +platformVersion: 3.5.4 spec: extra_spec: allow_empty_resources: true @@ -26,10 +26,10 @@ spec: image: mlrun/mlrun kind: job requirements: - - onnx~=1.13.0 - - onnxruntime~=1.14.0 - - onnxoptimizer~=0.3.0 - - onnxmltools~=1.11.0 - - tf2onnx~=1.13.0 + - onnx~=1.15.0 + - onnxruntime~=1.8.1 + - onnxoptimizer~=0.2.0 + - onnxmltools~=1.9.0 + - tf2onnx~=1.16.0 url: '' -version: 1.2.0 +version: 1.3.0 diff --git a/onnx_utils/requirements.txt b/onnx_utils/requirements.txt index dc7ff1e7b..a9acd7371 100644 --- a/onnx_utils/requirements.txt +++ b/onnx_utils/requirements.txt @@ -1,11 +1,11 @@ tqdm~=4.62.3 -tensorflow~=2.7.0 +tensorflow~=2.13.0 torch~=1.10.0 torchvision~=0.11.1 -onnx~=1.10.1 -onnxruntime~=1.8.1 -onnxoptimizer~=0.2.0 +onnx~=1.15.0 +onnxruntime~=1.12.1 +onnxoptimizer~=0.3.0 onnxmltools~=1.9.0 -tf2onnx~=1.9.0 +tf2onnx~=1.16.0 plotly~=5.4.0 -wrapt<1.15.0 # wrapt==1.15.0 fails tensorflow 2.7 Todo: please remove when updating tensorflow \ No newline at end of file +#wrapt<1.15.0 # wrapt==1.15.0 fails tensorflow 2.7 Todo: please remove when updating tensorflow \ No newline at end of file diff --git a/onnx_utils/test_onnx_utils.py b/onnx_utils/test_onnx_utils.py index 35b224c4a..aaae96372 100644 --- a/onnx_utils/test_onnx_utils.py +++ b/onnx_utils/test_onnx_utils.py @@ -257,7 +257,7 @@ def test_pytorch_to_onnx(): filename="test_onnx_utils.py", name="log_model", kind="job", - image="mlrun/ml-models", + image="mlrun/mlrun", ) # Run the function to log the model: @@ -341,7 +341,7 @@ def test_optimize(): filename="test_onnx_utils.py", name="log_model", kind="job", - image="mlrun/ml-models", + image="mlrun/mlrun", ) # Run the function to log the model: 
From 6fdc1b1adb171283d70b2e5458d0c943d93c4f8e Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Thu, 29 Aug 2024 13:25:11 +0300 Subject: [PATCH 2/8] [feature selection] update function yaml --- feature_selection/function.yaml | 75 +++++++++++++-------------------- feature_selection/item.yaml | 6 +-- 2 files changed, 32 insertions(+), 49 deletions(-) diff --git a/feature_selection/function.yaml b/feature_selection/function.yaml index 0851f54d3..d4e95f1e9 100644 --- a/feature_selection/function.yaml +++ b/feature_selection/function.yaml @@ -1,58 +1,33 @@ kind: job -metadata: - name: feature-selection - tag: '' - hash: 6dba16d062d81f78d3d210fee75edfe8b1def9b3 - project: '' - labels: - author: orz - categories: - - data-preparation - - machine-learning +verbose: false spec: + disable_auto_mount: false command: '' - args: [] - image: mlrun/mlrun - build: - functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os

import matplotlib.pyplot as plt
import mlrun
import mlrun.datastore
import mlrun.utils
import mlrun.feature_store as fs
import numpy as np
import pandas as pd
import seaborn as sns
from mlrun.artifacts import PlotArtifact
from mlrun.datastore.targets import ParquetTarget
# MLRun utils
from mlrun.utils.helpers import create_class
# Feature selection strategies
from sklearn.feature_selection import SelectFromModel, SelectKBest
# Scale feature scores
from sklearn.preprocessing import MinMaxScaler
# SKLearn estimators list
from sklearn.utils import all_estimators

# Statistical filters used when the caller passes no 'stat_filters':
DEFAULT_STAT_FILTERS = [
    "f_classif",
    "mutual_info_classif",
    "chi2",
    "f_regression",
]
# Model filters used when the caller passes no 'model_filters' (each filter name maps to the same estimator name):
DEFAULT_MODEL_FILTERS = {
    estimator_name: estimator_name
    for estimator_name in ("LinearSVC", "LogisticRegression", "ExtraTreesClassifier")
}


def _clear_current_figure():
    """
    Reset matplotlib's current plotting state by calling `plt.cla()`, `plt.clf()` and `plt.close()`, in that order.
    """
    for reset_step in (plt.cla, plt.clf, plt.close):
        reset_step()


def show_values_on_bars(axs, h_v="v", space=0.4):
    """
    Write the integer size of every bar next to it.

    :param axs:   A single axes object or a numpy array of axes objects. Every patch of every axes is annotated.
    :param h_v:   "v" to label each bar with its height, centered above it. "h" to label each bar with its width, to
                  the right of it. Any other value leaves the axes untouched.
    :param space: Horizontal gap added after the bar's end before the text. Used only when `h_v` is "h".
    """

    def _vertical_label(bar):
        # Centered horizontally on the bar, at the bar's top edge:
        text_x = bar.get_x() + bar.get_width() / 2
        text_y = bar.get_y() + bar.get_height()
        return text_x, text_y, int(bar.get_height()), "center"

    def _horizontal_label(bar):
        # 'space' past the bar's right edge:
        text_x = bar.get_x() + bar.get_width() + float(space)
        text_y = bar.get_y() + bar.get_height()
        return text_x, text_y, int(bar.get_width()), "left"

    # Pick the labeling strategy, unknown orientations are a no-op:
    if h_v == "v":
        place_label = _vertical_label
    elif h_v == "h":
        place_label = _horizontal_label
    else:
        return

    # Treat a single axes object as a group of one:
    axes_group = axs.flat if isinstance(axs, np.ndarray) else (axs,)
    for axis in axes_group:
        for bar in axis.patches:
            text_x, text_y, value, alignment = place_label(bar)
            axis.text(text_x, text_y, value, ha=alignment)


def plot_stat(context, stat_name, stat_df):
    """
    Draw a horizontal bar chart of a single score column and log it as a plot artifact.

    :param context:   The function context, used for logging the plot artifact.
    :param stat_name: Name of the score column in `stat_df`. Also used as the artifact key and the html file name.
    :param stat_df:   Dataframe holding the `stat_name` column. Its index is turned into the "index" column that labels
                      the bars.
    """
    _clear_current_figure()

    # Draw the bars, sorted from the highest score to the lowest:
    axes = plt.axes()
    ranked_scores = stat_df.sort_values(stat_name, ascending=False).reset_index()
    chart = sns.barplot(x=stat_name, y="index", data=ranked_scores, ax=axes)
    plt.tight_layout()

    # Write each bar's value (2 decimal places) next to it:
    for bar in chart.patches:
        bar_width = bar.get_width()
        text_x = 5 + bar_width
        text_y = bar.get_y() + 0.55 * bar.get_height()
        plt.text(text_x, text_y, "{:1.2f}".format(bar_width), ha="center", va="center")

    # Log the current figure and reset the plotting state for the next chart:
    artifact_path = os.path.join("plots", "feature_selection", f"{stat_name}.html")
    context.log_artifact(
        PlotArtifact(f"{stat_name}", body=plt.gcf()),
        local_path=artifact_path,
    )
    _clear_current_figure()


def feature_selection(
    context,
    df_artifact,
    k: int = 5,
    min_votes: float = 0.5,
    label_column: str = None,
    stat_filters: list = None,
    model_filters: dict = None,
    max_scaled_scores: bool = True,
    sample_ratio: float = None,
    output_vector_name: str = None,
    ignore_type_errors: bool = False,
    is_feature_vector: bool = False,
):
    """
    Applies selected feature selection statistical functions or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    The following datasets are logged to the context: 'feature_scores', 'max_scaled_scores_feature_scores' (only if
    'max_scaled_scores' is set), 'selected_features_count' and 'selected_features'. When a new feature vector is
    created, its uri is logged as the 'top_features_vector' result.

    :param context:             the function context.
    :param df_artifact:         dataframe to pass as input.
    :param k:                   number of top features to select from each statistical
                                function or model.
    :param min_votes:           minimal number of votes (from a model or by statistical
                                function) needed for a feature to be selected.
                                Can be specified by percentage of votes (float, clipped to [0, 1]) or absolute
                                number of votes (int).
    :param label_column:        ground-truth (y) labels.
    :param stat_filters:        statistical functions to apply to the features
                                (from sklearn.feature_selection). Defaulted to `DEFAULT_STAT_FILTERS`.
    :param model_filters:       models to use for feature evaluation, can be specified by
                                model name (ex. LinearSVC), formalized json (contains 'CLASS',
                                'FIT', 'META') or a path to such json file. Defaulted to `DEFAULT_MODEL_FILTERS`.
    :param max_scaled_scores:   produce feature scores table scaled with max_scaler.
    :param sample_ratio:        percentage of the dataset the user wishes to compute the feature selection process on.
    :param output_vector_name:  creates a new feature vector containing only the identified features.
    :param ignore_type_errors:  if False, a ValueError is raised when the features contain object typed columns.
    :param is_feature_vector:   bool stating if the data is passed as a feature vector. NOTE: the given value is
                                currently overridden - it is recalculated from the store uri of 'df_artifact'.

    :raise ValueError: If no label column could be determined, if 'k' is smaller than 1 or bigger than the number of
                       columns in the dataframe, or if there are object typed features and 'ignore_type_errors' is
                       False.
    """
    stat_filters = stat_filters or DEFAULT_STAT_FILTERS
    model_filters = model_filters or DEFAULT_MODEL_FILTERS

    # Determine from the artifact's store uri whether the input is a feature vector (overrides the given argument):
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(df_artifact.artifact_url)
    is_feature_vector = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix

    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split(".")[1]
        else:
            raise ValueError("No label_column was given, please add a label_column.")

    # Use the feature vector as dataframe
    df = df_artifact.as_df()

    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(
            f"K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K."
        )
    elif k < 1:
        raise ValueError("K cannot be smaller than 1. Please choose a bigger K.")

    # Create a sample dataframe of the original feature vector (sampled per label value)
    if sample_ratio:
        df = (
            df.groupby(label_column)
            .apply(lambda x: x.sample(frac=sample_ratio))
            .reset_index(drop=True)
        )
        df = df.dropna()

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    if np.object_ in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(
            f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float or int."
        )

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            score_func=create_class(f"sklearn.feature_selection.{stat_name}"), k=k
        )
        for stat_name in stat_filters
    }
    # Filters that must be fed with the absolute values of the features (chi2 rejects negative feature values):
    requires_abs = ["chi2"]

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()

    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Only the filters listed in 'requires_abs' get abs(X), all the others get the raw features:
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(
                index=X.columns, columns=[stat_name], data=stat.scores_
            )
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features

        except Exception as e:
            # Best effort - a failing statistical filter is logged and skipped:
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if ".json" in model:
            # A path to a json file - close the file once it was read:
            with open(model, "r") as model_file:
                current_model = json.load(model_file)
            classifier_class = create_class(current_model["META"]["class"])
            selected_models[model_name] = classifier_class(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            # Look the estimator up by the same name that was tested for membership:
            selected_models[model_name] = all_sklearn_estimators[model]()

        else:
            try:
                current_model = json.loads(model)
                classifier_class = create_class(current_model["META"]["class"])
                selected_models[model_name] = classifier_class(**current_model["CLASS"])
            except Exception as e:
                context.logger.info(f"unable to load {model} because of: {e}")

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():

        if model_name == "LogisticRegression":
            model.set_params(solver="liblinear")

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        fitted_estimator = select_from_model.estimator_
        if hasattr(fitted_estimator, "coef_"):
            model_scores = fitted_estimator.coef_
        elif hasattr(fitted_estimator, "feature_importances_"):
            model_scores = fitted_estimator.feature_importances_
        else:
            # No scores to collect - the model's votes were already registered above, so only skip the scores:
            context.logger.info(
                f"Couldn't collect the feature scores of {model_name}: the fitted estimator has neither 'coef_' "
                f"nor 'feature_importances_'."
            )
            continue

        # Take one score per feature: the first row of 2D scores (e.g. 'coef_' of shape (1, n_features)) or the
        # entire vector of 1D scores (e.g. 'feature_importances_'), instead of broadcasting its first element:
        model_scores = np.atleast_2d(model_scores)[0]
        stat_df = pd.DataFrame(index=X.columns, columns=[model_name], data=model_scores)
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(
        key="feature_scores",
        df=result_matrix_df,
        local_path="feature_scores.parquet",
        format="parquet",
    )
    if max_scaled_scores:
        # Infinite scores are replaced with NaN before scaling each column with a min-max scaler:
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(
            data=normalized_df,
            columns=result_matrix_df.columns,
            index=result_matrix_df.index,
        )
        context.log_dataset(
            key="max_scaled_scores_feature_scores",
            df=normalized_df,
            local_path="max_scaled_scores_feature_scores.parquet",
            format="parquet",
        )

    # Create feature count DataFrame (each filter's column is replaced with its 0 / 1 votes)
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, "num_votes"] = result_matrix_df.sum(axis=1)
    context.log_dataset(
        key="selected_features_count",
        df=result_matrix_df,
        local_path="selected_features_count.parquet",
        format="parquet",
    )

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        # A fraction of the configured filters, clipped to [0, 1] and rounded down:
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f"votes needed to be selected: {votes_needed}")

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(
        key="selected_features",
        df=final_df,
        local_path="selected_features.parquet",
        format="parquet",
    )

    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:
        # NOTE(review): this takes the first K rows of the votes table in the original column order, it is not sorted
        # by 'num_votes' - confirm this is the intended "top K".
        selected_features = result_matrix_df.head(k).index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        # NOTE(review): the label feature is hard-coded rather than taken from the source vector - confirm.
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description="feature vector composed strictly of our top features",
        )

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result("top_features_vector", top_features_fv.uri)
 - commands: [] - code_origin: '' - origin_filename: '' - requirements: [] entry_points: show_values_on_bars: - name: show_values_on_bars - doc: '' + lineno: 54 parameters: - name: axs - name: h_v default: v - name: space default: 0.4 - outputs: [] - lineno: 54 + name: show_values_on_bars has_varargs: false has_kwargs: false - plot_stat: - name: plot_stat doc: '' + plot_stat: + lineno: 76 parameters: - name: context - name: stat_name - name: stat_df - outputs: [] - lineno: 76 + name: plot_stat has_varargs: false has_kwargs: false + doc: '' feature_selection: - name: feature_selection - doc: 'Applies selected feature selection statistical functions or models on - our ''df_artifact''. - - - Each statistical function or model will vote for it''s best K selected features. - - If a feature has >= ''min_votes'' votes, it will be selected.' + lineno: 106 parameters: - name: context doc: the function context. @@ -103,18 +78,26 @@ spec: type: bool doc: bool stating if the data is passed as a feature vector. default: false - outputs: [] - lineno: 106 + name: feature_selection has_varargs: false has_kwargs: false - description: Select features through multiple Statistical and Model filters + doc: 'Applies selected feature selection statistical functions or models on + our ''df_artifact''. + + + Each statistical function or model will vote for it''s best K selected features. + + If a feature has >= ''min_votes'' votes, it will be selected.' + build: + origin_filename: '' + functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os

import matplotlib.pyplot as plt
import mlrun
import mlrun.datastore
import mlrun.utils
import mlrun.feature_store as fs
import numpy as np
import pandas as pd
import seaborn as sns
from mlrun.artifacts import PlotArtifact
from mlrun.datastore.targets import ParquetTarget
# MLRun utils
from mlrun.utils.helpers import create_class
# Feature selection strategies
from sklearn.feature_selection import SelectFromModel, SelectKBest
# Scale feature scores
from sklearn.preprocessing import MinMaxScaler
# SKLearn estimators list
from sklearn.utils import all_estimators

# Names of the `sklearn.feature_selection` scoring functions used when the caller passes no `stat_filters`.
DEFAULT_STAT_FILTERS = [
    "f_classif",
    "mutual_info_classif",
    "chi2",
    "f_regression",
]
# Model filters used when the caller passes no `model_filters`: each filter name maps to the
# sklearn estimator class name of the same spelling.
DEFAULT_MODEL_FILTERS = {
    estimator_name: estimator_name
    for estimator_name in ("LinearSVC", "LogisticRegression", "ExtraTreesClassifier")
}


def _clear_current_figure():
    """
    Reset the matplotlib pyplot state: clear the current axes, clear the current figure and close it.
    """
    # The order matters only in that it mirrors cla -> clf -> close.
    for reset_step in (plt.cla, plt.clf, plt.close):
        reset_step()


def show_values_on_bars(axs, h_v="v", space=0.4):
    """
    Write the (integer-truncated) size of every bar next to it.

    :param axs:   a single axes-like object exposing `patches` and `text`, or a numpy array of such objects.
    :param h_v:   "v" to label bars by their height (text centered above the bar), "h" to label them by
                  their width (text to the right of the bar). Any other value draws nothing.
    :param space: horizontal gap added after the bar end when `h_v` is "h".
    """

    def _annotate(axis):
        vertical = h_v == "v"
        if not vertical and h_v != "h":
            # Unknown orientation - nothing to draw
            return
        for bar in axis.patches:
            if vertical:
                text_x = bar.get_x() + bar.get_width() / 2
                text_y = bar.get_y() + bar.get_height()
                axis.text(text_x, text_y, int(bar.get_height()), ha="center")
            else:
                text_x = bar.get_x() + bar.get_width() + float(space)
                text_y = bar.get_y() + bar.get_height()
                axis.text(text_x, text_y, int(bar.get_width()), ha="left")

    if not isinstance(axs, np.ndarray):
        _annotate(axs)
        return

    # A grid of subplots: visit every axes object in the array
    for axis in axs.flat:
        _annotate(axis)


def plot_stat(context, stat_name, stat_df):
    """
    Plot one score column as a bar chart and log the figure as a plot artifact.

    :param context:   the function context, used to log the artifact.
    :param stat_name: name of the column of `stat_df` to plot (x axis); also used as the artifact key
                      and as the html file name.
    :param stat_df:   dataframe holding the `stat_name` column. Its index is plotted on the y axis, so
                      `reset_index()` must yield a column named "index".
    """
    _clear_current_figure()

    # Add chart
    axes = plt.axes()
    ranked_scores = stat_df.sort_values(stat_name, ascending=False).reset_index()
    chart = sns.barplot(x=stat_name, y="index", data=ranked_scores, ax=axes)
    plt.tight_layout()

    # Write each bar's value (2 decimals) next to it
    for bar in chart.patches:
        bar_width = bar.get_width()
        label_x = 5 + bar.get_width()
        label_y = bar.get_y() + 0.55 * bar.get_height()
        plt.text(
            label_x, label_y, "{:1.2f}".format(bar_width), ha="center", va="center"
        )

    artifact = PlotArtifact(f"{stat_name}", body=plt.gcf())
    artifact_path = os.path.join("plots", "feature_selection", f"{stat_name}.html")
    context.log_artifact(artifact, local_path=artifact_path)
    _clear_current_figure()


def feature_selection(
    context,
    df_artifact,
    k: int = 5,
    min_votes: float = 0.5,
    label_column: str = None,
    stat_filters: list = None,
    model_filters: dict = None,
    max_scaled_scores: bool = True,
    sample_ratio: float = None,
    output_vector_name: str = None,
    ignore_type_errors: bool = False,
    is_feature_vector: bool = False,
):
    """
    Applies selected feature selection statistical functions or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:             the function context.
    :param df_artifact:         dataframe to pass as input.
    :param k:                   number of top features to select from each statistical
                                function or model.
    :param min_votes:           minimal number of votes (from a model or by statistical
                                function) needed for a feature to be selected.
                                Can be specified by percentage of votes (float) or absolute
                                number of votes (int).
    :param label_column:        ground-truth (y) labels.
    :param stat_filters:        statistical functions to apply to the features
                                (from sklearn.feature_selection).
    :param model_filters:       models to use for feature evaluation, can be specified by
                                model name (ex. LinearSVC), formalized json (contains 'CLASS',
                                'FIT', 'META') or a path to such json file.
    :param max_scaled_scores:   produce feature scores table scaled with max_scaler.
    :param sample_ratio:        percentage of the dataset the user wishes to compute the feature selection process on.
    :param output_vector_name:  creates a new feature vector containing only the identified features.
    :param ignore_type_errors:  skips datatypes that are neither float nor int within the feature vector.
    :param is_feature_vector:   bool stating if the data is passed as a feature vector. NOTE: the value is
                                recomputed from the store URI prefix of 'df_artifact', so the passed value
                                is not used.

    :raises ValueError: if no label column can be determined, if 'k' is out of range, or if object-typed
                        columns are present while 'ignore_type_errors' is False.
    """
    stat_filters = stat_filters or DEFAULT_STAT_FILTERS
    model_filters = model_filters or DEFAULT_MODEL_FILTERS
    # Detect a feature vector input from the store URI prefix of the artifact
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(df_artifact.artifact_url)
    is_feature_vector = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix

    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split(".")[1]
        else:
            raise ValueError("No label_column was given, please add a label_column.")

    # Use the feature vector as dataframe
    df = df_artifact.as_df()

    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(
            f"K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K."
        )
    elif k < 1:
        raise ValueError("K cannot be smaller than 1. Please choose a bigger K.")

    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = (
            df.groupby(label_column)
            .apply(lambda x: x.sample(frac=sample_ratio))
            .reset_index(drop=True)
        )
        df = df.dropna()

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    if np.object_ in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(
            f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float or int."
        )

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            score_func=create_class(f"sklearn.feature_selection.{stat_name}"), k=k
        )
        for stat_name in stat_filters
    }
    # Scorers that only accept non-negative feature values and therefore are fed with abs(X)
    requires_abs = ["chi2"]

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()

    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Only the scorers listed in `requires_abs` get the absolute values; the rest see X as is
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(
                index=X.columns, columns=[stat_name], data=stat.scores_
            )
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features

        except Exception as e:
            # Best effort: a failing statistical filter is reported and skipped
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if ".json" in model:
            # Path to a json file describing the model
            with open(model, "r") as model_file:
                current_model = json.load(model_file)
            classifier_class = create_class(current_model["META"]["class"])
            selected_models[model_name] = classifier_class(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            # Look the class up by the estimator name (the dict value), which is what was just checked;
            # the filter name (the dict key) may be any label chosen by the user
            selected_models[model_name] = all_sklearn_estimators[model]()

        else:
            try:
                current_model = json.loads(model)
                classifier_class = create_class(current_model["META"]["class"])
                selected_models[model_name] = classifier_class(**current_model["CLASS"])
            except Exception as e:
                context.logger.info(f"unable to load {model} because of: {e}")

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():

        if model_name == "LogisticRegression":
            model.set_params(solver="liblinear")

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        fitted_estimator = select_from_model.estimator_
        if hasattr(fitted_estimator, "coef_"):
            importances = fitted_estimator.coef_
        elif hasattr(fitted_estimator, "feature_importances_"):
            importances = fitted_estimator.feature_importances_
        else:
            # No per-feature scores to record or plot for this model
            context.logger.info(
                f"{model_name} exposes neither coef_ nor feature_importances_, skipping its scores"
            )
            continue

        importances = np.asarray(importances)
        if importances.ndim > 1:
            # 2-D scores (one row per class): keep the first row. 1-D scores are used whole -
            # indexing them with [0] would yield a single scalar broadcast to every feature.
            importances = importances[0]

        stat_df = pd.DataFrame(
            index=X.columns, columns=[model_name], data=importances
        )
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(
        key="feature_scores",
        df=result_matrix_df,
        local_path="feature_scores.parquet",
        format="parquet",
    )
    if max_scaled_scores:
        # Infinite scores are turned into NaN before scaling
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(
            data=normalized_df,
            columns=result_matrix_df.columns,
            index=result_matrix_df.index,
        )
        context.log_dataset(
            key="max_scaled_scores_feature_scores",
            df=normalized_df,
            local_path="max_scaled_scores_feature_scores.parquet",
            format="parquet",
        )

    # Create feature count DataFrame: every filter column becomes a 0/1 vote per feature
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, "num_votes"] = result_matrix_df.sum(axis=1)
    context.log_dataset(
        key="selected_features_count",
        df=result_matrix_df,
        local_path="selected_features_count.parquet",
        format="parquet",
    )

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        # A ratio: clamp to [0, 1] and translate to a number of filters
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f"votes needed to be selected: {votes_needed}")

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(
        key="selected_features",
        df=final_df,
        local_path="selected_features.parquet",
        format="parquet",
    )

    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:
        # Selecting the top K features from our top feature dataframe
        selected_features = result_matrix_df.head(k).index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description="feature vector composed strictly of our top features",
        )

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result("top_features_vector", top_features_fv.uri)
 + code_origin: '' default_handler: feature_selection - disable_auto_mount: false - clone_target_dir: '' - env: [] - priority_class_name: '' - preemption_mode: prevent - affinity: null - tolerations: null - security_context: {} -verbose: false + description: Select features through multiple Statistical and Model filters + image: mlrun/mlrun +metadata: + categories: + - data-preparation + - machine-learning + tag: '' + name: feature-selection diff --git a/feature_selection/item.yaml b/feature_selection/item.yaml index 7e80a417b..1c79e6f12 100644 --- a/feature_selection/item.yaml +++ b/feature_selection/item.yaml @@ -12,9 +12,9 @@ labels: author: orz maintainers: [] marketplaceType: '' -mlrunVersion: 1.1.0 +mlrunVersion: 1.7.0 name: feature-selection -platformVersion: 3.5.0 +platformVersion: 3.6.0 spec: filename: feature_selection.py handler: feature_selection @@ -22,4 +22,4 @@ spec: kind: job requirements: [] url: '' -version: 1.4.0 +version: 1.5.0 From 64907af0011894ceffe6e7e8471a306eb215221f Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 2 Sep 2024 14:52:10 +0300 Subject: [PATCH 3/8] [feature selection] update function yaml --- feature_selection/function.yaml | 56 ++++++++++++++++----------------- feature_selection/item.yaml | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/feature_selection/function.yaml b/feature_selection/function.yaml index d4e95f1e9..f1bf53b8a 100644 --- a/feature_selection/function.yaml +++ b/feature_selection/function.yaml @@ -1,33 +1,43 @@ +metadata: + name: feature-selection + tag: '' + categories: + - data-preparation + - machine-learning kind: job -verbose: false spec: - disable_auto_mount: false - command: '' entry_points: show_values_on_bars: - lineno: 54 + doc: '' + has_kwargs: false parameters: - name: axs - name: h_v default: v - name: space default: 0.4 - name: show_values_on_bars + lineno: 54 has_varargs: false - has_kwargs: false - doc: '' + name: show_values_on_bars plot_stat: - lineno: 76 + doc: '' + 
has_kwargs: false parameters: - name: context - name: stat_name - name: stat_df - name: plot_stat + lineno: 76 has_varargs: false - has_kwargs: false - doc: '' + name: plot_stat feature_selection: - lineno: 106 + doc: 'Applies selected feature selection statistical functions or models on + our ''df_artifact''. + + + Each statistical function or model will vote for it''s best K selected features. + + If a feature has >= ''min_votes'' votes, it will be selected.' + has_kwargs: false parameters: - name: context doc: the function context. @@ -78,26 +88,16 @@ spec: type: bool doc: bool stating if the data is passed as a feature vector. default: false - name: feature_selection + lineno: 106 has_varargs: false - has_kwargs: false - doc: 'Applies selected feature selection statistical functions or models on - our ''df_artifact''. - - - Each statistical function or model will vote for it''s best K selected features. - - If a feature has >= ''min_votes'' votes, it will be selected.' + name: feature_selection + disable_auto_mount: false + command: '' build: origin_filename: '' functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os

import matplotlib.pyplot as plt
import mlrun
import mlrun.datastore
import mlrun.utils
import mlrun.feature_store as fs
import numpy as np
import pandas as pd
import seaborn as sns
from mlrun.artifacts import PlotArtifact
from mlrun.datastore.targets import ParquetTarget
# MLRun utils
from mlrun.utils.helpers import create_class
# Feature selection strategies
from sklearn.feature_selection import SelectFromModel, SelectKBest
# Scale feature scores
from sklearn.preprocessing import MinMaxScaler
# SKLearn estimators list
from sklearn.utils import all_estimators

# Names of the `sklearn.feature_selection` scoring functions used when the caller passes no `stat_filters`.
DEFAULT_STAT_FILTERS = [
    "f_classif",
    "mutual_info_classif",
    "chi2",
    "f_regression",
]
# Model filters used when the caller passes no `model_filters`: each filter name maps to the
# sklearn estimator class name of the same spelling.
DEFAULT_MODEL_FILTERS = {
    estimator_name: estimator_name
    for estimator_name in ("LinearSVC", "LogisticRegression", "ExtraTreesClassifier")
}


def _clear_current_figure():
    """
    Reset the matplotlib pyplot state: clear the current axes, clear the current figure and close it.
    """
    # The order matters only in that it mirrors cla -> clf -> close.
    for reset_step in (plt.cla, plt.clf, plt.close):
        reset_step()


def show_values_on_bars(axs, h_v="v", space=0.4):
    """
    Write the (integer-truncated) size of every bar next to it.

    :param axs:   a single axes-like object exposing `patches` and `text`, or a numpy array of such objects.
    :param h_v:   "v" to label bars by their height (text centered above the bar), "h" to label them by
                  their width (text to the right of the bar). Any other value draws nothing.
    :param space: horizontal gap added after the bar end when `h_v` is "h".
    """

    def _annotate(axis):
        vertical = h_v == "v"
        if not vertical and h_v != "h":
            # Unknown orientation - nothing to draw
            return
        for bar in axis.patches:
            if vertical:
                text_x = bar.get_x() + bar.get_width() / 2
                text_y = bar.get_y() + bar.get_height()
                axis.text(text_x, text_y, int(bar.get_height()), ha="center")
            else:
                text_x = bar.get_x() + bar.get_width() + float(space)
                text_y = bar.get_y() + bar.get_height()
                axis.text(text_x, text_y, int(bar.get_width()), ha="left")

    if not isinstance(axs, np.ndarray):
        _annotate(axs)
        return

    # A grid of subplots: visit every axes object in the array
    for axis in axs.flat:
        _annotate(axis)


def plot_stat(context, stat_name, stat_df):
    """
    Plot one score column as a bar chart and log the figure as a plot artifact.

    :param context:   the function context, used to log the artifact.
    :param stat_name: name of the column of `stat_df` to plot (x axis); also used as the artifact key
                      and as the html file name.
    :param stat_df:   dataframe holding the `stat_name` column. Its index is plotted on the y axis, so
                      `reset_index()` must yield a column named "index".
    """
    _clear_current_figure()

    # Add chart
    axes = plt.axes()
    ranked_scores = stat_df.sort_values(stat_name, ascending=False).reset_index()
    chart = sns.barplot(x=stat_name, y="index", data=ranked_scores, ax=axes)
    plt.tight_layout()

    # Write each bar's value (2 decimals) next to it
    for bar in chart.patches:
        bar_width = bar.get_width()
        label_x = 5 + bar.get_width()
        label_y = bar.get_y() + 0.55 * bar.get_height()
        plt.text(
            label_x, label_y, "{:1.2f}".format(bar_width), ha="center", va="center"
        )

    artifact = PlotArtifact(f"{stat_name}", body=plt.gcf())
    artifact_path = os.path.join("plots", "feature_selection", f"{stat_name}.html")
    context.log_artifact(artifact, local_path=artifact_path)
    _clear_current_figure()


def feature_selection(
    context,
    df_artifact,
    k: int = 5,
    min_votes: float = 0.5,
    label_column: str = None,
    stat_filters: list = None,
    model_filters: dict = None,
    max_scaled_scores: bool = True,
    sample_ratio: float = None,
    output_vector_name: str = None,
    ignore_type_errors: bool = False,
    is_feature_vector: bool = False,
):
    """
    Applies selected feature selection statistical functions or models on our 'df_artifact'.

    Each statistical function or model will vote for its best K selected features.
    If a feature has >= 'min_votes' votes, it will be selected.

    :param context:             the function context.
    :param df_artifact:         dataframe to pass as input.
    :param k:                   number of top features to select from each statistical
                                function or model.
    :param min_votes:           minimal number of votes (from a model or by statistical
                                function) needed for a feature to be selected.
                                Can be specified by percentage of votes (float) or absolute
                                number of votes (int).
    :param label_column:        ground-truth (y) labels.
    :param stat_filters:        statistical functions to apply to the features
                                (from sklearn.feature_selection).
    :param model_filters:       models to use for feature evaluation, can be specified by
                                model name (ex. LinearSVC), formalized json (contains 'CLASS',
                                'FIT', 'META') or a path to such json file.
    :param max_scaled_scores:   produce feature scores table scaled with max_scaler.
    :param sample_ratio:        percentage of the dataset the user wishes to compute the feature selection process on.
    :param output_vector_name:  creates a new feature vector containing only the identified features.
    :param ignore_type_errors:  skips datatypes that are neither float nor int within the feature vector.
    :param is_feature_vector:   bool stating if the data is passed as a feature vector. NOTE: the value is
                                recomputed from the store URI prefix of 'df_artifact', so the passed value
                                is not used.

    :raises ValueError: if no label column can be determined, if 'k' is out of range, or if object-typed
                        columns are present while 'ignore_type_errors' is False.
    """
    stat_filters = stat_filters or DEFAULT_STAT_FILTERS
    model_filters = model_filters or DEFAULT_MODEL_FILTERS
    # Detect a feature vector input from the store URI prefix of the artifact
    store_uri_prefix, _ = mlrun.datastore.parse_store_uri(df_artifact.artifact_url)
    is_feature_vector = mlrun.utils.StorePrefix.FeatureVector == store_uri_prefix

    # Look inside meta.spec.label_feature to identify the label_column if the user did not specify it
    if label_column is None:
        if is_feature_vector:
            label_column = df_artifact.meta.spec.label_feature.split(".")[1]
        else:
            raise ValueError("No label_column was given, please add a label_column.")

    # Use the feature vector as dataframe
    df = df_artifact.as_df()

    # Ensure k is not bigger than the total number of features
    if k > df.shape[1]:
        raise ValueError(
            f"K cannot be bigger than the total number of features ({df.shape[1]}). Please choose a smaller K."
        )
    elif k < 1:
        raise ValueError("K cannot be smaller than 1. Please choose a bigger K.")

    # Create a sample dataframe of the original feature vector
    if sample_ratio:
        df = (
            df.groupby(label_column)
            .apply(lambda x: x.sample(frac=sample_ratio))
            .reset_index(drop=True)
        )
        df = df.dropna()

    # Set feature vector and labels
    y = df.pop(label_column)
    X = df

    if np.object_ in list(X.dtypes) and ignore_type_errors is False:
        raise ValueError(
            f"{df.select_dtypes(include=['object']).columns.tolist()} are neither float or int."
        )

    # Create selected statistical estimators
    stat_functions_list = {
        stat_name: SelectKBest(
            score_func=create_class(f"sklearn.feature_selection.{stat_name}"), k=k
        )
        for stat_name in stat_filters
    }
    # Scorers that only accept non-negative feature values and therefore are fed with abs(X)
    requires_abs = ["chi2"]

    # Run statistic filters
    selected_features_agg = {}
    stats_df = pd.DataFrame(index=X.columns).dropna()

    for stat_name, stat_func in stat_functions_list.items():
        try:
            # Only the scorers listed in `requires_abs` get the absolute values; the rest see X as is
            params = (abs(X), y) if stat_name in requires_abs else (X, y)
            stat = stat_func.fit(*params)

            # Collect stat function results
            stat_df = pd.DataFrame(
                index=X.columns, columns=[stat_name], data=stat.scores_
            )
            plot_stat(context, stat_name, stat_df)
            stats_df = stats_df.join(stat_df)

            # Select K Best features
            selected_features = X.columns[stat_func.get_support()]
            selected_features_agg[stat_name] = selected_features

        except Exception as e:
            # Best effort: a failing statistical filter is reported and skipped
            context.logger.info(f"Couldn't calculate {stat_name} because of: {e}")

    # Create models from class name / json file / json params
    all_sklearn_estimators = dict(all_estimators()) if len(model_filters) > 0 else {}
    selected_models = {}
    for model_name, model in model_filters.items():
        if ".json" in model:
            # Path to a json file describing the model
            with open(model, "r") as model_file:
                current_model = json.load(model_file)
            classifier_class = create_class(current_model["META"]["class"])
            selected_models[model_name] = classifier_class(**current_model["CLASS"])
        elif model in all_sklearn_estimators:
            # Look the class up by the estimator name (the dict value), which is what was just checked;
            # the filter name (the dict key) may be any label chosen by the user
            selected_models[model_name] = all_sklearn_estimators[model]()

        else:
            try:
                current_model = json.loads(model)
                classifier_class = create_class(current_model["META"]["class"])
                selected_models[model_name] = classifier_class(**current_model["CLASS"])
            except Exception as e:
                context.logger.info(f"unable to load {model} because of: {e}")

    # Run model filters
    models_df = pd.DataFrame(index=X.columns)
    for model_name, model in selected_models.items():

        if model_name == "LogisticRegression":
            model.set_params(solver="liblinear")

        # Train model and get feature importance
        select_from_model = SelectFromModel(model).fit(X, y)
        feature_idx = select_from_model.get_support()
        feature_names = X.columns[feature_idx]
        selected_features_agg[model_name] = feature_names.tolist()

        # Collect model feature importance
        fitted_estimator = select_from_model.estimator_
        if hasattr(fitted_estimator, "coef_"):
            importances = fitted_estimator.coef_
        elif hasattr(fitted_estimator, "feature_importances_"):
            importances = fitted_estimator.feature_importances_
        else:
            # No per-feature scores to record or plot for this model
            context.logger.info(
                f"{model_name} exposes neither coef_ nor feature_importances_, skipping its scores"
            )
            continue

        importances = np.asarray(importances)
        if importances.ndim > 1:
            # 2-D scores (one row per class): keep the first row. 1-D scores are used whole -
            # indexing them with [0] would yield a single scalar broadcast to every feature.
            importances = importances[0]

        stat_df = pd.DataFrame(
            index=X.columns, columns=[model_name], data=importances
        )
        models_df = models_df.join(stat_df)

        plot_stat(context, model_name, stat_df)

    # Create feature_scores DF with stat & model filters scores
    result_matrix_df = pd.concat([stats_df, models_df], axis=1, sort=False)
    context.log_dataset(
        key="feature_scores",
        df=result_matrix_df,
        local_path="feature_scores.parquet",
        format="parquet",
    )
    if max_scaled_scores:
        # Infinite scores are turned into NaN before scaling
        normalized_df = result_matrix_df.replace([np.inf, -np.inf], np.nan).values
        min_max_scaler = MinMaxScaler()
        normalized_df = min_max_scaler.fit_transform(normalized_df)
        normalized_df = pd.DataFrame(
            data=normalized_df,
            columns=result_matrix_df.columns,
            index=result_matrix_df.index,
        )
        context.log_dataset(
            key="max_scaled_scores_feature_scores",
            df=normalized_df,
            local_path="max_scaled_scores_feature_scores.parquet",
            format="parquet",
        )

    # Create feature count DataFrame: every filter column becomes a 0/1 vote per feature
    for test_name in selected_features_agg:
        result_matrix_df[test_name] = [
            1 if x in selected_features_agg[test_name] else 0 for x in X.columns
        ]
    result_matrix_df.loc[:, "num_votes"] = result_matrix_df.sum(axis=1)
    context.log_dataset(
        key="selected_features_count",
        df=result_matrix_df,
        local_path="selected_features_count.parquet",
        format="parquet",
    )

    # How many votes are needed for a feature to be selected?
    if isinstance(min_votes, int):
        votes_needed = min_votes
    else:
        # A ratio: clamp to [0, 1] and translate to a number of filters
        num_filters = len(stat_filters) + len(model_filters)
        votes_needed = int(np.floor(num_filters * max(min(min_votes, 1), 0)))
    context.logger.info(f"votes needed to be selected: {votes_needed}")

    # Create final feature dataframe
    selected_features = result_matrix_df[
        result_matrix_df.num_votes >= votes_needed
    ].index.tolist()
    good_feature_df = df.loc[:, selected_features]
    final_df = pd.concat([good_feature_df, y], axis=1)
    context.log_dataset(
        key="selected_features",
        df=final_df,
        local_path="selected_features.parquet",
        format="parquet",
    )

    # Creating a new feature vector containing only the identified top features
    if is_feature_vector and df_artifact.meta.spec.features and output_vector_name:
        # Selecting the top K features from our top feature dataframe
        selected_features = result_matrix_df.head(k).index

        # Match the selected feature names to the FS Feature annotations
        matched_selections = [
            feature
            for feature in list(df_artifact.meta.spec.features)
            for selected in list(selected_features)
            if feature.endswith(selected)
        ]

        # Defining our new feature vector
        top_features_fv = fs.FeatureVector(
            output_vector_name,
            matched_selections,
            label_feature="labels.label",
            description="feature vector composed strictly of our top features",
        )

        # Saving
        top_features_fv.save()
        fs.get_offline_features(top_features_fv, target=ParquetTarget())

        # Logging our new feature vector URI
        context.log_result("top_features_vector", top_features_fv.uri)
 code_origin: '' default_handler: feature_selection - description: Select features through multiple Statistical and Model filters image: mlrun/mlrun -metadata: - categories: - - data-preparation - - machine-learning - tag: '' - name: feature-selection + description: Select features through multiple Statistical and Model filters +verbose: false diff --git a/feature_selection/item.yaml b/feature_selection/item.yaml index 1c79e6f12..c7400a7f0 100644 --- a/feature_selection/item.yaml +++ b/feature_selection/item.yaml @@ -22,4 +22,4 @@ spec: kind: job requirements: [] url: '' -version: 1.5.0 +version: 1.7.0 From d1d61a0f1765749cdd5ee0d4225aea1c60093f88 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 2 Sep 2024 16:09:01 +0300 Subject: [PATCH 4/8] Revert "[onnx utils] update onnx utils packages" This reverts commit 88727986ffa91662593958023be8ac3ccef2cab0. --- onnx_utils/function.yaml | 42 ++++++++++++++++++----------------- onnx_utils/item.yaml | 16 ++++++------- onnx_utils/requirements.txt | 12 +++++----- onnx_utils/test_onnx_utils.py | 4 ++-- 4 files changed, 38 insertions(+), 36 deletions(-) diff --git a/onnx_utils/function.yaml b/onnx_utils/function.yaml index 88f810fb4..7a0054c4d 100644 --- a/onnx_utils/function.yaml +++ b/onnx_utils/function.yaml @@ -2,7 +2,7 @@ kind: job metadata: name: onnx-utils tag: '' - hash: fd6cd909ef6e055c348b44a0313e190513cd755b + hash: 0c4a6491b976d5220d3ebfb83a3905dd47e86be2 project: '' labels: author: guyl @@ -16,16 +16,16 @@ spec: functionSourceCode: # Copyright 2019 Iguazio
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any, Callable, Dict, List, Tuple

import mlrun


class _ToONNXConversions:
    """
    An ONNX conversion functions library class.

    Every static method receives an already initialized, framework specific model handler and delegates the actual
    conversion to the handler's `to_onnx` method after parsing the framework specific arguments.
    """

    @staticmethod
    def tf_keras_to_onnx(
        model_handler,
        onnx_model_name: str = None,
        optimize_model: bool = True,
        input_signature: List[Tuple[Tuple[int, ...], str]] = None,
    ):
        """
        Convert a TF.Keras model to an ONNX model and log it back to MLRun as a new model object.

        :param model_handler:   An initialized TFKerasModelHandler with a loaded model to convert to ONNX.
        :param onnx_model_name: The name to use to log the converted ONNX model. If not given, the given `model_name`
                                will be used with an additional suffix `_onnx`. Defaulted to None.
        :param optimize_model:  Whether or not to optimize the ONNX model using 'onnxoptimizer' before saving the model.
                                Defaulted to True.
        :param input_signature: A list of the input layers shape and data type properties. Expected to receive a list
                                where each element is an input layer tuple. An input layer tuple is a tuple of:
                                [0] = Layer's shape, a tuple of integers.
                                [1] = Layer's data type, a mlrun.data_types.ValueType string.
                                If None, the input signature will be tried to be read from the model artifact. Defaulted
                                to None.

        :raise MLRunRuntimeError: If 'input_signature' was not provided and reading the input layers information from
                                  the model failed.
        """
        # Import the framework and handler:
        import tensorflow as tf
        from mlrun.frameworks.tf_keras import TFKerasUtils

        # Check the given 'input_signature' parameter:
        if input_signature is None:
            # Read the inputs from the model:
            try:
                model_handler.read_inputs_from_model()
            except Exception as error:
                # Chain the original error so its traceback is kept as the explicit cause of the raised one:
                raise mlrun.errors.MLRunRuntimeError(
                    f"Please provide the 'input_signature' parameter. The function tried reading the input layers "
                    f"information automatically but failed with the following error: {error}"
                ) from error
        else:
            # Parse the 'input_signature' parameter into one tensor spec per input layer:
            input_signature = [
                tf.TensorSpec(
                    shape=shape,
                    dtype=TFKerasUtils.convert_value_type_to_tf_dtype(
                        value_type=value_type
                    ),
                )
                for (shape, value_type) in input_signature
            ]

        # Convert to ONNX (a None signature is passed as is to the handler):
        model_handler.to_onnx(
            model_name=onnx_model_name,
            input_signature=input_signature,
            optimize=optimize_model,
        )

    @staticmethod
    def pytorch_to_onnx(
        model_handler,
        onnx_model_name: str = None,
        optimize_model: bool = True,
        input_signature: List[Tuple[Tuple[int, ...], str]] = None,
        input_layers_names: List[str] = None,
        output_layers_names: List[str] = None,
        dynamic_axes: Dict[str, Dict[int, str]] = None,
        is_batched: bool = True,
    ):
        """
        Convert a PyTorch model to an ONNX model and log it back to MLRun as a new model object.

        :param model_handler:       An initialized PyTorchModelHandler with a loaded model to convert to ONNX.
        :param onnx_model_name:     The name to use to log the converted ONNX model. If not given, the given
                                    `model_name` will be used with an additional suffix `_onnx`. Defaulted to None.
        :param optimize_model:      Whether or not to optimize the ONNX model using 'onnxoptimizer' before saving the
                                    model. Defaulted to True.
        :param input_signature:     A list of the input layers shape and data type properties. Expected to receive a
                                    list where each element is an input layer tuple. An input layer tuple is a tuple of:
                                    [0] = Layer's shape, a tuple of integers.
                                    [1] = Layer's data type, a mlrun.data_types.ValueType string.
                                    If None, the input signature will be tried to be read from the model artifact.
                                    Defaulted to None.
        :param input_layers_names:  List of names to assign to the input nodes of the graph in order. All of the other
                                    parameters (inner layers) can be set as well by passing additional names in the
                                    list. The order is by the order of the parameters in the model. If None, the inputs
                                    will be read from the handler's inputs. If its also None, it is defaulted to:
                                    "input_0", "input_1", ...
        :param output_layers_names: List of names to assign to the output nodes of the graph in order. If None, the
                                    outputs will be read from the handler's outputs. If its also None, it is defaulted
                                    to: "output_0" (for multiple outputs, this parameter must be provided).
        :param dynamic_axes:        If part of the input / output shape is dynamic, like (batch_size, 3, 32, 32) you can
                                    specify it by giving a dynamic axis to the input / output layer by its name as
                                    follows: {
                                        "input layer name": {0: "batch_size"},
                                        "output layer name": {0: "batch_size"},
                                    }
                                    If provided, the 'is_batched' flag will be ignored. Defaulted to None.
        :param is_batched:          Whether to include a batch size as the first axis in every input and output layer.
                                    Defaulted to True. Will be ignored if 'dynamic_axes' is provided.
        """
        # Import the framework and handler:
        import torch
        from mlrun.frameworks.pytorch import PyTorchUtils

        # Parse the 'input_signature' parameter into a tuple of zero tensors, one per input layer, to be used as the
        # input sample of the conversion:
        if input_signature is not None:
            input_signature = tuple(
                torch.zeros(
                    size=shape,
                    dtype=PyTorchUtils.convert_value_type_to_torch_dtype(
                        value_type=value_type
                    ),
                )
                for (shape, value_type) in input_signature
            )

        # Convert to ONNX:
        model_handler.to_onnx(
            model_name=onnx_model_name,
            input_sample=input_signature,
            optimize=optimize_model,
            input_layers_names=input_layers_names,
            output_layers_names=output_layers_names,
            dynamic_axes=dynamic_axes,
            is_batched=is_batched,
        )


# Framework name (as exposed by a model handler's `FRAMEWORK_NAME`) -> the function that converts a model of that
# framework to ONNX:
_CONVERSION_MAP: Dict[str, Callable] = {
    "tensorflow.keras": _ToONNXConversions.tf_keras_to_onnx,
    "torch": _ToONNXConversions.pytorch_to_onnx,
}


def to_onnx(
    context: mlrun.MLClientCtx,
    model_path: str,
    onnx_model_name: str = None,
    optimize_model: bool = True,
    framework_kwargs: Dict[str, Any] = None,
):
    """
    Convert the given model to an ONNX model.

    The model is loaded into a model handler, and the conversion function is chosen by the handler's framework name.

    :param context:          The MLRun function execution context
    :param model_path:       The model path store object.
    :param onnx_model_name:  The name to use to log the converted ONNX model. If not given, the given `model_name` will
                             be used with an additional suffix `_onnx`. Defaulted to None.
    :param optimize_model:   Whether to optimize the ONNX model using 'onnxoptimizer' before saving the model. Defaulted
                             to True.
    :param framework_kwargs: Additional arguments each framework may require in order to convert to ONNX. To get the doc
                             string of the desired framework onnx conversion function, pass "help" (the doc string is
                             printed and no conversion is made).

    :raise MLRunInvalidArgumentError: If the model's framework has no ONNX conversion or if a TypeError was raised
                                      during the conversion (for example due to an unexpected framework keyword
                                      argument).
    """
    from mlrun.frameworks.auto_mlrun.auto_mlrun import AutoMLRun

    # Get a model handler of the required framework:
    model_handler = AutoMLRun.load_model(model_path=model_path, context=context)

    # Get the model's framework:
    framework = model_handler.FRAMEWORK_NAME

    # Use the conversion map to get the specific framework to onnx conversion:
    if framework not in _CONVERSION_MAP:
        raise mlrun.errors.MLRunInvalidArgumentError(
            f"The following framework: '{framework}', has no ONNX conversion."
        )
    conversion_function = _CONVERSION_MAP[framework]

    # Check if needed to print the function's doc string ("help" is passed). The framework must be known first, hence
    # the check comes only after the model was loaded:
    if framework_kwargs == "help":
        print(conversion_function.__doc__)
        return

    # Set the default empty framework kwargs if needed:
    if framework_kwargs is None:
        framework_kwargs = {}

    # Run the conversion:
    try:
        conversion_function(
            model_handler=model_handler,
            onnx_model_name=onnx_model_name,
            optimize_model=optimize_model,
            **framework_kwargs,
        )
    except TypeError as exception:
        # Chain the original TypeError so its traceback (pointing at the real failure) is kept as the explicit cause:
        raise mlrun.errors.MLRunInvalidArgumentError(
            f"ERROR: A TypeError exception was raised during the conversion:\n{exception}. "
            f"Please read the {framework} framework conversion function doc string by passing 'help' in the "
            f"'framework_kwargs' dictionary parameter."
        ) from exception


def optimize(
    context: mlrun.MLClientCtx,
    model_path: str,
    optimizations: List[str] = None,
    fixed_point: bool = False,
    optimized_model_name: str = None,
):
    """
    Optimize the given ONNX model and log the optimized model.

    :param context:              The MLRun function execution context.
    :param model_path:           Path to the ONNX model object.
    :param optimizations:        List of possible optimizations. To see what optimizations are available, pass "help"
                                 (the available optimizations are printed and the model is not touched). If None, all
                                 of the optimizations will be used. Defaulted to None.
    :param fixed_point:          Optimize the weights using fixed point. Defaulted to False.
    :param optimized_model_name: The name of the optimized model. If None, the original model will be overridden.
                                 Defaulted to None.
    """
    # Import the optimizer and the ONNX model handler:
    import onnxoptimizer
    from mlrun.frameworks.onnx import ONNXModelHandler

    # "help" is a request to list the available optimizations, so print them and finish without loading the model:
    if optimizations == "help":
        passes_listing = "\n* ".join(onnxoptimizer.get_available_passes())
        print(f"The available optimizations are:\n* {passes_listing}")
        return

    # Create the handler and load the ONNX model into it:
    onnx_handler = ONNXModelHandler(model_path=model_path, context=context)
    onnx_handler.load()

    # Apply the requested optimizations:
    onnx_handler.optimize(optimizations=optimizations, fixed_point=fixed_point)

    # Log under the new name if one was given, otherwise under the handler's current name:
    if optimized_model_name is not None:
        onnx_handler.set_model_name(model_name=optimized_model_name)
    onnx_handler.log()
 base_image: mlrun/mlrun commands: [] - code_origin: '' - origin_filename: '' + code_origin: https://github.com/yonishelach/functions.git#f84b9565a33d8159315992ebba5838d41f6cc112:/Users/Yonatan_Shelach/projects/functions/onnx_utils/onnx_utils.py + origin_filename: /Users/Yonatan_Shelach/projects/functions/onnx_utils/onnx_utils.py with_mlrun: false auto_build: true requirements: - - onnx~=1.15.0 - - onnxruntime~=1.8.1 - - onnxoptimizer~=0.2.0 - - onnxmltools~=1.9.0 - - tf2onnx~=1.16.0 + - onnx~=1.13.0 + - onnxruntime~=1.14.0 + - onnxoptimizer~=0.3.0 + - onnxmltools~=1.11.0 + - tf2onnx~=1.13.0 entry_points: tf_keras_to_onnx: name: tf_keras_to_onnx @@ -35,6 +35,7 @@ spec: - name: model_handler doc: An initialized TFKerasModelHandler with a loaded model to convert to ONNX. + default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -54,10 +55,9 @@ spec: data type, a mlrun.data_types.ValueType string. If None, the input signature will be tried to be read from the model artifact. Defaulted to None.' default: null - outputs: [] + outputs: + - default: '' lineno: 26 - has_varargs: false - has_kwargs: false pytorch_to_onnx: name: pytorch_to_onnx doc: Convert a PyTorch model to an ONNX model and log it back to MLRun as a @@ -66,6 +66,7 @@ spec: - name: model_handler doc: An initialized PyTorchModelHandler with a loaded model to convert to ONNX. + default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -113,10 +114,9 @@ spec: doc: Whether to include a batch size as the first axis in every input and output layer. Defaulted to True. Will be ignored if 'dynamic_axes' is provided. default: true - outputs: [] + outputs: + - default: '' lineno: 81 - has_varargs: false - has_kwargs: false to_onnx: name: to_onnx doc: Convert the given model to an ONNX model. 
@@ -124,9 +124,11 @@ spec: - name: context type: MLClientCtx doc: The MLRun function execution context + default: '' - name: model_path type: str doc: The model path store object. + default: '' - name: onnx_model_name type: str doc: The name to use to log the converted ONNX model. If not given, the given @@ -144,10 +146,9 @@ spec: ONNX. To get the doc string of the desired framework onnx conversion function, pass "help". default: null - outputs: [] + outputs: + - default: '' lineno: 160 - has_varargs: false - has_kwargs: false optimize: name: optimize doc: Optimize the given ONNX model. @@ -155,9 +156,11 @@ spec: - name: context type: MLClientCtx doc: The MLRun function execution context. + default: '' - name: model_path type: str doc: Path to the ONNX model object. + default: '' - name: optimizations type: List[str] doc: List of possible optimizations. To see what optimizations are available, @@ -173,10 +176,9 @@ spec: doc: The name of the optimized model. If None, the original model will be overridden. Defaulted to None. default: null - outputs: [] + outputs: + - default: '' lineno: 219 - has_varargs: false - has_kwargs: false description: ONNX intigration in MLRun, some utils functions for the ONNX framework, optimizing and converting models from different framework to ONNX using MLRun. 
default_handler: to_onnx diff --git a/onnx_utils/item.yaml b/onnx_utils/item.yaml index 84486d9f8..36335837a 100644 --- a/onnx_utils/item.yaml +++ b/onnx_utils/item.yaml @@ -12,9 +12,9 @@ labels: author: guyl maintainers: [] marketplaceType: '' -mlrunVersion: 1.6.3 +mlrunVersion: 1.1.0 name: onnx_utils -platformVersion: 3.5.4 +platformVersion: 3.5.0 spec: extra_spec: allow_empty_resources: true @@ -26,10 +26,10 @@ spec: image: mlrun/mlrun kind: job requirements: - - onnx~=1.15.0 - - onnxruntime~=1.8.1 - - onnxoptimizer~=0.2.0 - - onnxmltools~=1.9.0 - - tf2onnx~=1.16.0 + - onnx~=1.13.0 + - onnxruntime~=1.14.0 + - onnxoptimizer~=0.3.0 + - onnxmltools~=1.11.0 + - tf2onnx~=1.13.0 url: '' -version: 1.3.0 +version: 1.2.0 diff --git a/onnx_utils/requirements.txt b/onnx_utils/requirements.txt index a9acd7371..dc7ff1e7b 100644 --- a/onnx_utils/requirements.txt +++ b/onnx_utils/requirements.txt @@ -1,11 +1,11 @@ tqdm~=4.62.3 -tensorflow~=2.13.0 +tensorflow~=2.7.0 torch~=1.10.0 torchvision~=0.11.1 -onnx~=1.15.0 -onnxruntime~=1.12.1 -onnxoptimizer~=0.3.0 +onnx~=1.10.1 +onnxruntime~=1.8.1 +onnxoptimizer~=0.2.0 onnxmltools~=1.9.0 -tf2onnx~=1.16.0 +tf2onnx~=1.9.0 plotly~=5.4.0 -#wrapt<1.15.0 # wrapt==1.15.0 fails tensorflow 2.7 Todo: please remove when updating tensorflow \ No newline at end of file +wrapt<1.15.0 # wrapt==1.15.0 fails tensorflow 2.7 Todo: please remove when updating tensorflow \ No newline at end of file diff --git a/onnx_utils/test_onnx_utils.py b/onnx_utils/test_onnx_utils.py index aaae96372..35b224c4a 100644 --- a/onnx_utils/test_onnx_utils.py +++ b/onnx_utils/test_onnx_utils.py @@ -257,7 +257,7 @@ def test_pytorch_to_onnx(): filename="test_onnx_utils.py", name="log_model", kind="job", - image="mlrun/mlrun", + image="mlrun/ml-models", ) # Run the function to log the model: @@ -341,7 +341,7 @@ def test_optimize(): filename="test_onnx_utils.py", name="log_model", kind="job", - image="mlrun/mlrun", + image="mlrun/ml-models", ) # Run the function to log the model: 
From 98d09427db1cc9bdd08e3c498289e0bb35664df6 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 2 Sep 2024 16:12:12 +0300 Subject: [PATCH 5/8] [feature selection] update function yaml --- feature_selection/item.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_selection/item.yaml b/feature_selection/item.yaml index c7400a7f0..1b25ec410 100644 --- a/feature_selection/item.yaml +++ b/feature_selection/item.yaml @@ -12,7 +12,7 @@ labels: author: orz maintainers: [] marketplaceType: '' -mlrunVersion: 1.7.0 +mlrunVersion: 1.6.4 name: feature-selection platformVersion: 3.6.0 spec: From 2fb20802033039178fcf94ad33eff63155e9ff37 Mon Sep 17 00:00:00 2001 From: Avi Asulin Date: Mon, 2 Sep 2024 16:34:04 +0300 Subject: [PATCH 6/8] [feature selection] update function yaml --- feature_selection/requirements.txt | 2 +- feature_selection/test_feature_selection.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_selection/requirements.txt b/feature_selection/requirements.txt index 961f64ea4..70a079c7d 100644 --- a/feature_selection/requirements.txt +++ b/feature_selection/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn~=1.0.2 +scikit-learn matplotlib seaborn scikit-plot diff --git a/feature_selection/test_feature_selection.py b/feature_selection/test_feature_selection.py index 6289648f2..d21e648ff 100644 --- a/feature_selection/test_feature_selection.py +++ b/feature_selection/test_feature_selection.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from mlrun import code_to_function +from mlrun import code_to_function, get_dataitem from pathlib import Path import shutil @@ -44,5 +44,5 @@ def test_run_local_feature_selection(): inputs={'df_artifact': 'data/metrics.pq'}, artifact_path='artifacts/', ) - assert run.artifact('feature_scores').get() and run.artifact('selected_features').get() + assert run.outputs['feature_scores'] and run.outputs['selected_features'] _delete_outputs({ARTIFACTS_PATH, RUNS_PATH, SCHEDULES_PATH}) From 7f3403ced7d291e52305b272c185f7506c84cdca Mon Sep 17 00:00:00 2001 From: Avi Asulin <34214569+aviaIguazio@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:42:06 +0300 Subject: [PATCH 7/8] Update feature_selection/test_feature_selection.py Co-authored-by: Eyal Danieli --- feature_selection/test_feature_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_selection/test_feature_selection.py b/feature_selection/test_feature_selection.py index d21e648ff..9cb5ca621 100644 --- a/feature_selection/test_feature_selection.py +++ b/feature_selection/test_feature_selection.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from mlrun import code_to_function, get_dataitem +from mlrun import code_to_function from pathlib import Path import shutil From 75dbafd698c6cfe41aa59b9663b22c3ea2a2b64f Mon Sep 17 00:00:00 2001 From: Avi Asulin <34214569+aviaIguazio@users.noreply.github.com> Date: Thu, 12 Sep 2024 08:43:08 +0300 Subject: [PATCH 8/8] Update item.yaml --- feature_selection/item.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_selection/item.yaml b/feature_selection/item.yaml index 1b25ec410..99675b4e8 100644 --- a/feature_selection/item.yaml +++ b/feature_selection/item.yaml @@ -22,4 +22,4 @@ spec: kind: job requirements: [] url: '' -version: 1.7.0 +version: 1.5.0