From 529c2c017b9d3ae39a1372964fac24c775b851ed Mon Sep 17 00:00:00 2001 From: bvenn Date: Tue, 7 Jan 2025 20:06:31 +0100 Subject: [PATCH] rename Impute module to Imputation closes #341 --- docs/Imputation.fsx | 14 +-- src/FSharp.Stats/FSharp.Stats.fsproj | 1 + src/FSharp.Stats/ML/Imputation.fs | 170 +++++++++++++++++++++++++++ src/FSharp.Stats/ML/Impute.fs | 111 +++-------------- 4 files changed, 196 insertions(+), 100 deletions(-) create mode 100644 src/FSharp.Stats/ML/Imputation.fs diff --git a/docs/Imputation.fsx b/docs/Imputation.fsx index de85f9df..a33c2938 100644 --- a/docs/Imputation.fsx +++ b/docs/Imputation.fsx @@ -62,8 +62,8 @@ Missing data imputation based on the k-nearest neighbour algorithm: *) // init kNearest MatrixBaseImpute -let kn : Impute.MatrixBaseImputation = Impute.kNearestImpute 2 -let imputedData = Impute.imputeBy kn Ops.isNan data +let kn : Imputation.MatrixBaseImputation = Imputation.kNearestImpute 2 +let imputedData = Imputation.imputeBy kn Ops.isNan data (*** hide ***) let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix imputedData) @@ -79,10 +79,10 @@ let imputedDataMatrix = "k nearest neighbours imputed data\r\n" + FSharp.Stats.F *) // init random VectorBaseImpute -let rnd = Impute.rnd (System.Random()) +let rnd = Imputation.rnd (System.Random()) -let rndRowWise = Impute.imputeRowWiseBy rnd Ops.isNan data -let rndColWise = Impute.imputeColWiseBy rnd Ops.isNan data +let rndRowWise = Imputation.imputeRowWiseBy rnd Ops.isNan data +let rndColWise = Imputation.imputeColWiseBy rnd Ops.isNan data (*** hide ***) let rndRowDataMatrix = "rndRowDataMatrix imputed data\r\n" + FSharp.Stats.FSIPrinters.matrix (matrix rndRowWise) @@ -100,8 +100,8 @@ let rndColDataMatrix = "rndColDataMatrix imputed data\r\n" + FSharp.Stats.FSIPri *) -let normalRowWise = Impute.imputeRowWiseBy Impute.normal Ops.isNan data -let normalColWise = Impute.imputeColWiseBy Impute.normal Ops.isNan data +let normalRowWise = 
Imputation.imputeRowWiseBy Imputation.normal Ops.isNan data +let normalColWise = Imputation.imputeColWiseBy Imputation.normal Ops.isNan data (*** hide ***) diff --git a/src/FSharp.Stats/FSharp.Stats.fsproj b/src/FSharp.Stats/FSharp.Stats.fsproj index 73c712a9..12bc4e58 100644 --- a/src/FSharp.Stats/FSharp.Stats.fsproj +++ b/src/FSharp.Stats/FSharp.Stats.fsproj @@ -156,6 +156,7 @@ + diff --git a/src/FSharp.Stats/ML/Imputation.fs b/src/FSharp.Stats/ML/Imputation.fs new file mode 100644 index 00000000..53ddae05 --- /dev/null +++ b/src/FSharp.Stats/ML/Imputation.fs @@ -0,0 +1,170 @@ +namespace FSharp.Stats.ML + +open FSharp.Stats + +/// Module for data imputation and missing value filtering +module Imputation = + + module Cleaning = + + let calcFractionBy (isMissing) (dataRow:seq<'a>) = + dataRow + |> Seq.fold (fun (mc,nmc) state -> + match isMissing state with + | true -> (mc+1,nmc) + | false -> (mc,nmc+1) ) + (0,0) + |> fun (mc,nmc) -> float mc / float (nmc + mc) + + + let removeAllBy f threshold (data:seq<#seq<'a>>) = + data + |> Seq.filter (fun row -> f row <= threshold ) + + + /// Type definition for a vector based imputation. 
+ /// The imputed values are based only on the given array + type VectorBaseImputation<'a> = seq<'a> -> int -> 'a + + /// Type definition for a matrix based imputation + /// The imputed values are based on the given whole dataset + type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b + + + /// Imputation by random sampling from the input vector + /// + /// + /// + /// + /// + /// + /// + let rnd (rnd:System.Random) : VectorBaseImputation<'a> = + fun fdata index -> + let farr = Array.ofSeq fdata + if farr.Length < 1 then failwithf "Vector needs at least one non-missing value" + farr.[rnd.Next(0,farr.Length)] // maxValue of Random.Next is exclusive: Length (not Length - 1) keeps the last element drawable + + + /// Imputation by sampling from a Gaussian normal distribution based on the input vector + let normal : VectorBaseImputation = + fun fdata index -> + let mean = Seq.mean fdata + let std = Seq.stDev fdata + if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then + Distributions.Continuous.Normal.Sample mean std + else + failwithf "Vector needs at least two non-missing value" + + + ///// Imputation by sampling from a Gaussian normal distribution based on the input vector + //let normalTruncated : VectorBaseImputation = + // fun fdata index -> + // let mean = Seq.mean fdata + // let std = Seq.stDev fdata + // if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then + // Distributions.Continuous.Normal.Sample mean std + // else + // failwithf "Vector needs at least two non-missing value" + + + /// Imputation by k-nearest neighbour + /// + /// + /// + /// + /// + /// + /// + let kNearestImpute k : MatrixBaseImputation = + fun data arr index -> + + let kNearestFrom (distance:DistanceMetrics.Distance<'a>) k (arr: 'a array) (queryCoordinates:'a) = + arr + |> Array.map (fun t -> (distance t queryCoordinates,t)) + |> Array.sortBy fst + |> Array.take k + + let euclNanSq = DistanceMetrics.euclideanNaNSquared + let tmpArr = + kNearestFrom euclNanSq k (data |> Array.ofSeq) arr + |> Array.map snd + |> JaggedArray.transpose + |> Array.map 
Seq.mean + tmpArr.[index] + + + /// Imputes column-wise by vector-based imputation + /// + /// + /// + /// + /// + /// + /// + /// + /// + let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = + data + |> JaggedArray.ofJaggedSeq + |> JaggedArray.transpose + |> Array.map (fun col -> + let fCol = col |> Array.filter (isMissing >> not) + let impute' = impute fCol + col + |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) + ) + |> JaggedArray.transpose + + + /// Imputes row-wise by vector-based imputation + /// + /// + /// + /// + /// + /// + /// + /// + /// + let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = + data + |> JaggedArray.ofJaggedSeq + |> Array.map (fun row -> + let fRow = row |> Array.filter (isMissing >> not) + let impute' = impute fRow + row + |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) + ) + + + /// Imputes rows by matrix-based imputation + /// + /// + /// + /// + /// + /// + /// + /// + /// + let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data = + let fData = + data + |> Seq.filter (fun row -> row |> Seq.exists isMissing |> not) + |> Seq.map (fun row -> row |> Seq.toArray) + |> Seq.toArray + + data + |> JaggedArray.ofJaggedSeq + |> Array.map (fun row -> + let row' = row |> Array.ofSeq + let impute' = impute fData row' + row' + |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) + ) + + + + + diff --git a/src/FSharp.Stats/ML/Impute.fs b/src/FSharp.Stats/ML/Impute.fs index cb2adbba..e1139ac5 100644 --- a/src/FSharp.Stats/ML/Impute.fs +++ b/src/FSharp.Stats/ML/Impute.fs @@ -1,37 +1,20 @@ namespace FSharp.Stats.ML open FSharp.Stats -//open FSharp.Care -//open FSharp.Care.Collections +open System /// Module for data imputation and missing value filtering +[] module Impute = module Cleaning = - + let calcFractionBy (isMissing) (dataRow:seq<'a>) = - dataRow - |> Seq.fold (fun (mc,nmc) state -> - match isMissing state 
with - | true -> (mc+1,nmc) - | false -> (mc,nmc+1) ) - (0,0) - |> fun (mc,nmc) -> float mc / float (nmc + mc) + Imputation.Cleaning.calcFractionBy isMissing dataRow let removeAllBy f threshold (data:seq<#seq<'a>>) = - data - |> Seq.filter (fun row -> f row <= threshold ) - - - /// Type definintion for a vector based imputation. - /// The imputed values are based only on the given array - type VectorBaseImputation<'a> = seq<'a> -> int -> 'a - - /// Type definintion for a vector based imputation - /// The imputed values are based on the given whole dataset - type MatrixBaseImputation<'a,'b> = seq<'a> -> 'a -> int -> 'b - + Imputation.Cleaning.removeAllBy f threshold data /// Imputation by random sampling from the input vector /// @@ -41,22 +24,13 @@ module Impute = /// /// /// - let rnd (rnd:System.Random) : VectorBaseImputation<'a> = - fun fdata index -> - let farr = Array.ofSeq fdata - if farr.Length < 1 then failwithf "Vector needs at least one non-missing value" - farr.[rnd.Next(0,farr.Length - 1)] + let rnd (rnd:System.Random) : Imputation.VectorBaseImputation<'a> = + Imputation.rnd rnd /// Imputation by sampling from a gausian normal distribution based on the input vector - let normal : VectorBaseImputation = - fun fdata index -> - let mean = Seq.mean fdata - let std = Seq.stDev fdata - if not(System.Double.IsNaN(mean) || System.Double.IsNaN(std)) then - Distributions.Continuous.Normal.Sample mean std - else - failwithf "Vector needs at least two non-missing value" + let normal: Imputation.VectorBaseImputation = + Imputation.normal ///// Imputation by sampling from a gausian normal distribution based on the input vector @@ -78,23 +52,8 @@ module Impute = /// /// /// - let kNearestImpute k : MatrixBaseImputation = - fun data arr index -> - - let kNearestFrom (distance:DistanceMetrics.Distance<'a>) k (arr: 'a array) (queryCoordinates:'a) = - arr - |> Array.map (fun t -> (distance t queryCoordinates,t)) - |> Array.sortBy fst - |> Array.take k - - let euclNanSq = 
DistanceMetrics.euclideanNaNSquared - let tmpArr = - kNearestFrom euclNanSq k (data |> Array.ofSeq) arr - |> Array.map snd - |> JaggedArray.transpose - |> Array.map Seq.mean - tmpArr.[index] - + let kNearestImpute k : Imputation.MatrixBaseImputation = + Imputation.kNearestImpute k /// Imputes column-wise by vector-based imputation /// @@ -106,18 +65,9 @@ module Impute = /// /// /// - let imputeColWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = - data - |> JaggedArray.ofJaggedSeq - |> JaggedArray.transpose - |> Array.map (fun col -> - let fCol = col |> Array.filter (isMissing >> not) - let impute' = impute fCol - col - |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) - ) - |> JaggedArray.transpose - + let imputeColWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = + Imputation.imputeColWiseBy impute isMissing data + /// Imputes row-wise by vector-based imputation /// @@ -129,15 +79,8 @@ module Impute = /// /// /// - let imputeRowWiseBy (impute: VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = - data - |> JaggedArray.ofJaggedSeq - |> Array.map (fun row -> - let fRow = row |> Array.filter (isMissing >> not) - let impute' = impute fRow - row - |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) - ) + let imputeRowWiseBy (impute: Imputation.VectorBaseImputation<'a>) isMissing (data : seq<#seq<'a>>) = + Imputation.imputeRowWiseBy impute isMissing data /// Imputes rows by matrix-based imputation @@ -150,23 +93,5 @@ module Impute = /// /// /// - let imputeBy (impute: MatrixBaseImputation<'a[],'a>) isMissing data = - let fData = - data - |> Seq.filter (fun row -> row |> Seq.exists isMissing |> not) - |> Seq.map (fun row -> row |> Seq.toArray) - |> Seq.toArray - - data - |> JaggedArray.ofJaggedSeq - |> Array.map (fun row -> - let row' = row |> Array.ofSeq - let impute' = impute fData row' - row' - |> Array.mapi (fun i v -> if isMissing v then (impute' i) else v) - ) - 
- - - - + let imputeBy (impute: Imputation.MatrixBaseImputation<'a[],'a>) isMissing data = + Imputation.imputeBy impute isMissing data