From 283fa419d900249d0befef6b0d37e7bafea33ea2 Mon Sep 17 00:00:00 2001 From: Donato Meoli Date: Mon, 7 Oct 2019 12:13:29 +0200 Subject: [PATCH] moved util functions to utils.py, moved probability learners from learning.py to probabilistic_learning.py with tests, fixed typos and fixed imports in .ipynb files (#1120) * changed queue to set in AC3 Changed queue to set in AC3 (as in the pseudocode of the original algorithm) to reduce the number of consistency-check due to the redundancy of the same arcs in queue. For example, on the harder1 configuration of the Sudoku CSP the number consistency-check has been reduced from 40464 to 12562! * re-added test commented by mistake * added the mentioned AC4 algorithm for constraint propagation AC3 algorithm has non-optimal worst case time-complexity O(cd^3 ), while AC4 algorithm runs in O(cd^2) worst case time * added doctest in Sudoku for AC4 and and the possibility of choosing the constant propagation algorithm in mac inference * removed useless doctest for AC4 in Sudoku because AC4's tests are already present in test_csp.py * added map coloring SAT problems * fixed typo errors and removed unnecessary brackets * reformulated the map coloring problem * Revert "reformulated the map coloring problem" This reverts commit 20ab0e5afa238a0556e68f173b07ad32d0779d3b. * Revert "fixed typo errors and removed unnecessary brackets" This reverts commit f743146c43b28e0525b0f0b332faebc78c15946f. * Revert "added map coloring SAT problems" This reverts commit 9e0fa550e85081cf5b92fb6a3418384ab5a9fdfd. * Revert "removed useless doctest for AC4 in Sudoku because AC4's tests are already present in test_csp.py" This reverts commit b3cd24c511a82275f5b43c9f176396e6ba05f67e. * Revert "added doctest in Sudoku for AC4 and and the possibility of choosing the constant propagation algorithm in mac inference" This reverts commit 6986247481a05f1e558b93b2bf3cdae395f9c4ee. 
* Revert "added the mentioned AC4 algorithm for constraint propagation" This reverts commit 03551fbf2aa3980b915d4b6fefcbc70f24547b03. * added map coloring SAT problem * fixed build error * Revert "added map coloring SAT problem" This reverts commit 93af259e4811ddd775429f8a334111b9dd9e268c. * Revert "fixed build error" This reverts commit 6641c2c861728f3d43d3931ef201c6f7093cbc96. * added map coloring SAT problem * removed redundant parentheses * added Viterbi algorithm * added monkey & bananas planning problem * simplified condition in search.py * added tests for monkey & bananas planning problem * removed monkey & bananas planning problem * Revert "removed monkey & bananas planning problem" This reverts commit 9d37ae0def15b9e058862cb465da13d2eb926968. * Revert "added tests for monkey & bananas planning problem" This reverts commit 24041e9a1a0ab936f7a2608e3662c8efec559382. * Revert "simplified condition in search.py" This reverts commit 6d229ce9bde5033802aca29ad3047f37ee6d870d. * Revert "added monkey & bananas planning problem" This reverts commit c74933a8905de7bb569bcaed7230930780560874. * defined the PlanningProblem as a specialization of a search.Problem & fixed typo errors * fixed doctest in logic.py * fixed doctest for cascade_distribution * added ForwardPlanner and tests * added __lt__ implementation for Expr * added more tests * renamed forward planner * Revert "renamed forward planner" This reverts commit c4139e50e3a75a036607f4627717d70ad0919554. 
* renamed forward planner class & added doc * added backward planner and tests * fixed mdp4e.py doctests * removed ignore_delete_lists_heuristic flag * fixed heuristic for forward and backward planners * added SATPlan and tests * fixed ignore delete lists heuristic in forward and backward planners * fixed backward planner and added tests * updated doc * added nary csp definition and examples * added CSPlan and tests * fixed CSPlan * added book's cryptarithmetic puzzle example * fixed typo errors in test_csp * fixed #1111 * added sortedcontainers to yml and doc to CSPlan * added tests for n-ary csp * fixed utils.extend * updated test_probability.py * converted static methods to functions * added AC3b and AC4 with heuristic and tests * added conflict-driven clause learning sat solver * added tests for cdcl and heuristics * fixed probability.py * fixed import * fixed kakuro * added Martelli and Montanari rule-based unification algorithm * removed duplicate standardize_variables * renamed variables known as built-in functions * fixed typos in learning.py * renamed some files and fixed typos * fixed typos * fixed typos * fixed tests * removed unify_mm * remove unnecessary brackets * fixed tests * moved utility functions to utils.py * fixed typos * moved utils function to utils.py, separated probability learning classes from learning.py, fixed typos and fixed imports in .ipynb files * added missing learners * fixed Travis build * fixed typos * fixed typos * fixed typos * fixed typos * fixed typos in agents files * fixed imports in agent files --- agents.py | 14 +- agents4e.py | 6 +- csp.ipynb | 13 +- deep_learning4e.py | 142 ++-- knowledge.py | 6 +- knowledge_FOIL.ipynb | 14 +- learning.ipynb | 12 +- learning.py | 1100 +++++++++++--------------- learning4e.py | 762 +++++++++--------- learning_apps.ipynb | 12 +- logic.py | 20 +- probabilistic_learning.py | 154 ++++ reinforcement_learning.ipynb | 13 +- requirements.txt | 2 +- tests/test_agents.py | 54 +- 
tests/test_agents4e.py | 51 +- tests/test_deep_learning4e.py | 41 +- tests/test_learning.py | 157 ++-- tests/test_learning4e.py | 76 +- tests/test_probabilistic_learning.py | 38 + tests/test_utils.py | 55 +- text.py | 2 +- utils.py | 73 +- utils4e.py | 2 +- 24 files changed, 1400 insertions(+), 1419 deletions(-) create mode 100644 probabilistic_learning.py create mode 100644 tests/test_probabilistic_learning.py diff --git a/agents.py b/agents.py index 0cab77eb2..6c01aa5b4 100644 --- a/agents.py +++ b/agents.py @@ -333,8 +333,7 @@ def run(self, steps=1000): def list_things_at(self, location, tclass=Thing): """Return all things exactly at a given location.""" - return [thing for thing in self.things - if thing.location == location and isinstance(thing, tclass)] + return [thing for thing in self.things if thing.location == location and isinstance(thing, tclass)] def some_things_at(self, location, tclass=Thing): """Return true if at least one of the things at location @@ -993,9 +992,8 @@ def is_done(self): else: print("Death by {} [-1000].".format(explorer[0].killed_by)) else: - print("Explorer climbed out {}." - .format( - "with Gold [+1000]!" if Gold() not in self.things else "without Gold [+0]")) + print("Explorer climbed out {}.".format("with Gold [+1000]!" 
+ if Gold() not in self.things else "without Gold [+0]")) return True # TODO: Arrow needs to be implemented @@ -1012,9 +1010,9 @@ def compare_agents(EnvFactory, AgentFactories, n=10, steps=1000): >>> environment = TrivialVacuumEnvironment >>> agents = [ModelBasedVacuumAgent, ReflexVacuumAgent] >>> result = compare_agents(environment, agents) - >>> performance_ModelBasedVacummAgent = result[0][1] - >>> performance_ReflexVacummAgent = result[1][1] - >>> performance_ReflexVacummAgent <= performance_ModelBasedVacummAgent + >>> performance_ModelBasedVacuumAgent = result[0][1] + >>> performance_ReflexVacuumAgent = result[1][1] + >>> performance_ReflexVacuumAgent <= performance_ModelBasedVacuumAgent True """ envs = [EnvFactory() for i in range(n)] diff --git a/agents4e.py b/agents4e.py index c25397783..fab36a46c 100644 --- a/agents4e.py +++ b/agents4e.py @@ -1012,9 +1012,9 @@ def compare_agents(EnvFactory, AgentFactories, n=10, steps=1000): >>> environment = TrivialVacuumEnvironment >>> agents = [ModelBasedVacuumAgent, ReflexVacuumAgent] >>> result = compare_agents(environment, agents) - >>> performance_ModelBasedVacummAgent = result[0][1] - >>> performance_ReflexVacummAgent = result[1][1] - >>> performance_ReflexVacummAgent <= performance_ModelBasedVacummAgent + >>> performance_ModelBasedVacuumAgent = result[0][1] + >>> performance_ReflexVacuumAgent = result[1][1] + >>> performance_ReflexVacuumAgent <= performance_ModelBasedVacuumAgent True """ envs = [EnvFactory() for i in range(n)] diff --git a/csp.ipynb b/csp.ipynb index 163cc6b1e..5d490846b 100644 --- a/csp.ipynb +++ b/csp.ipynb @@ -16,7 +16,7 @@ "outputs": [], "source": [ "from csp import *\n", - "from notebook import psource, pseudocode, plot_NQueens\n", + "from notebook import psource, plot_NQueens\n", "%matplotlib inline\n", "\n", "# Hide warnings in the matplotlib sections\n", @@ -3068,8 +3068,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" + }, + "pycharm": { + 
"stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/deep_learning4e.py b/deep_learning4e.py index dadf19d6b..18c41f54e 100644 --- a/deep_learning4e.py +++ b/deep_learning4e.py @@ -1,3 +1,5 @@ +"""Deep learning. (Chapters 20)""" + import math import random import statistics @@ -8,24 +10,20 @@ from keras.models import Sequential from keras.preprocessing import sequence -from utils4e import sigmoid, dotproduct, softmax1D, conv1D, GaussianKernel, element_wise_product, \ - vector_add, random_weights, scalar_vector_product, matrix_multiplication, map_vector, mse_loss - - -# DEEP NEURAL NETWORKS. (Chapter 19) -# ________________________________________________ -# 19.3 Models -# 19.3.1 Computational Graphs and Layers +from utils4e import (sigmoid, dotproduct, softmax1D, conv1D, GaussianKernel, element_wise_product, vector_add, + random_weights, scalar_vector_product, matrix_multiplication, map_vector, mse_loss) class Node: """ - A node in a computational graph. Contains the pointer to all its parents. + A node in a computational graph contains the pointer to all its parents. :param val: value of current node. :param parents: a container of all parents of current node. 
""" - def __init__(self, val=None, parents=[]): + def __init__(self, val=None, parents=None): + if parents is None: + parents = [] self.val = val self.parents = parents @@ -35,7 +33,7 @@ def __repr__(self): class NNUnit(Node): """ - A single unit of a layer in a Neural Network + A single unit of a layer in a neural network :param weights: weights between parent nodes and current node :param value: value of current node """ @@ -59,11 +57,8 @@ def forward(self, inputs): raise NotImplementedError -# 19.3.2 Output Layers - - class OutputLayer(Layer): - """Example of a 1D softmax output layer in 19.3.2""" + """1D softmax output layer in 19.3.2""" def __init__(self, size=3): super(OutputLayer, self).__init__(size) @@ -77,7 +72,7 @@ def forward(self, inputs): class InputLayer(Layer): - """Example of a 1D input layer. Layer size is the same as input vector size.""" + """1D input layer. Layer size is the same as input vector size.""" def __init__(self, size=3): super(InputLayer, self).__init__(size) @@ -90,9 +85,6 @@ def forward(self, inputs): return inputs -# 19.3.3 Hidden Layers - - class DenseLayer(Layer): """ 1D dense layer in a neural network. @@ -121,9 +113,6 @@ def forward(self, inputs): return res -# 19.3.4 Convolutional networks - - class ConvLayer1D(Layer): """ 1D convolution layer of in neural network. @@ -137,10 +126,10 @@ def __init__(self, size=3, kernel_size=3): node.weights = GaussianKernel(kernel_size) def forward(self, features): - # Each node in layer takes a channel in the features. + # each node in layer takes a channel in the features. assert len(self.nodes) == len(features) res = [] - # compute the convolution output of each channel, store it in node.val. 
+ # compute the convolution output of each channel, store it in node.val for node, feature in zip(self.nodes, features): out = conv1D(feature, node.weights) res.append(out) @@ -148,12 +137,11 @@ def forward(self, features): return res -# 19.3.5 Pooling and Downsampling - - class MaxPoolingLayer1D(Layer): - """1D max pooling layer in a neural network. - :param kernel_size: max pooling area size""" + """ + 1D max pooling layer in a neural network. + :param kernel_size: max pooling area size + """ def __init__(self, size=3, kernel_size=3): super(MaxPoolingLayer1D, self).__init__(size) @@ -174,38 +162,30 @@ def forward(self, features): return res -# ____________________________________________________________________ -# 19.4 optimization algorithms - - def init_examples(examples, idx_i, idx_t, o_units): """Init examples from dataset.examples.""" inputs, targets = {}, {} - # random.shuffle(examples) for i, e in enumerate(examples): - # Input values of e + # input values of e inputs[i] = [e[i] for i in idx_i] if o_units > 1: - # One-Hot representation of e's target + # one-hot representation of e's target t = [0 for i in range(o_units)] t[e[idx_t]] = 1 targets[i] = t else: - # Target value of e + # target value of e targets[i] = [e[idx_t]] return inputs, targets -# 19.4.1 Stochastic gradient descent - - def gradient_descent(dataset, net, loss, epochs=1000, l_rate=0.01, batch_size=1, verbose=None): """ - gradient descent algorithm to update the learnable parameters of a network. - :return: the updated network. + Gradient descent algorithm to update the learnable parameters of a network. 
+ :return: the updated network """ examples = dataset.examples # init data @@ -233,13 +213,11 @@ def gradient_descent(dataset, net, loss, epochs=1000, l_rate=0.01, batch_size=1, return net -# 19.4.2 Other gradient-based optimization algorithms - - -def adam_optimizer(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / 10 ** 8, l_rate=0.001, batch_size=1, - verbose=None): +def adam_optimizer(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / 10 ** 8, + l_rate=0.001, batch_size=1, verbose=None): """ - Adam optimizer in Figure 19.6 to update the learnable parameters of a network. + [Figure 19.6] + Adam optimizer to update the learnable parameters of a network. Required parameters are similar to gradient descent. :return the updated network """ @@ -292,14 +270,11 @@ def adam_optimizer(dataset, net, loss, epochs=1000, rho=(0.9, 0.999), delta=1 / return net -# 19.4.3 Back-propagation - - def BackPropagation(inputs, targets, theta, net, loss): """ The back-propagation algorithm for multilayer networks in only one epoch, to calculate gradients of theta - :param inputs: A batch of inputs in an array. Each input is an iterable object. - :param targets: A batch of targets in an array. Each target is an iterable object. + :param inputs: a batch of inputs in an array. Each input is an iterable object. + :param targets: a batch of targets in an array. Each target is an iterable object. :param theta: parameters to be updated. :param net: a list of predefined layer objects representing their linear sequence. :param loss: a predefined loss function taking array of inputs and targets. 
@@ -321,19 +296,19 @@ def BackPropagation(inputs, targets, theta, net, loss): i_val = inputs[e] t_val = targets[e] - # Forward pass and compute batch loss + # forward pass and compute batch loss for i in range(1, n_layers): layer_out = net[i].forward(i_val) i_val = layer_out batch_loss += loss(t_val, layer_out) - # Initialize delta + # initialize delta delta = [[] for _ in range(n_layers)] previous = [layer_out[i] - t_val[i] for i in range(o_units)] h_layers = n_layers - 1 - - # Backward pass + + # backward pass for i in range(h_layers, 0, -1): layer = net[i] derivative = [layer.activation.derivative(node.val) for node in layer.nodes] @@ -349,11 +324,8 @@ def BackPropagation(inputs, targets, theta, net, loss): return total_gradients, batch_loss -# 19.4.5 Batch normalization - - class BatchNormalizationLayer(Layer): - """Example of a batch normalization layer.""" + """Batch normalization layer.""" def __init__(self, size, epsilon=0.001): super(BatchNormalizationLayer, self).__init__(size) @@ -378,19 +350,20 @@ def forward(self, inputs): def get_batch(examples, batch_size=1): - """split examples into multiple batches""" + """Split examples into multiple batches""" for i in range(0, len(examples), batch_size): yield examples[i: i + batch_size] -# example of NNs - - -def neural_net_learner(dataset, hidden_layer_sizes=[4], learning_rate=0.01, epochs=100, optimizer=gradient_descent, - batch_size=1, verbose=None): - """Example of a simple dense multilayer neural network. - :param hidden_layer_sizes: size of hidden layers in the form of a list""" +def NeuralNetLearner(dataset, hidden_layer_sizes=None, learning_rate=0.01, epochs=100, + optimizer=gradient_descent, batch_size=1, verbose=None): + """ + Simple dense multilayer neural network. 
+ :param hidden_layer_sizes: size of hidden layers in the form of a list + """ + if hidden_layer_sizes is None: + hidden_layer_sizes = [4] input_size = len(dataset.inputs) output_size = len(dataset.values[dataset.target]) @@ -404,8 +377,8 @@ def neural_net_learner(dataset, hidden_layer_sizes=[4], learning_rate=0.01, epoc raw_net.append(DenseLayer(hidden_input_size, output_size)) # update parameters of the network - learned_net = optimizer(dataset, raw_net, mse_loss, epochs, l_rate=learning_rate, batch_size=batch_size, - verbose=verbose) + learned_net = optimizer(dataset, raw_net, mse_loss, epochs, l_rate=learning_rate, + batch_size=batch_size, verbose=verbose) def predict(example): n_layers = len(learned_net) @@ -423,9 +396,9 @@ def predict(example): return predict -def perceptron_learner(dataset, learning_rate=0.01, epochs=100, verbose=None): +def PerceptronLearner(dataset, learning_rate=0.01, epochs=100, verbose=None): """ - Example of a simple perceptron neural network. + Simple perceptron neural network. """ input_size = len(dataset.inputs) output_size = len(dataset.values[dataset.target]) @@ -443,17 +416,14 @@ def predict(example): return predict -# ____________________________________________________________________ -# 19.6 Recurrent neural networks - - -def simple_rnn_learner(train_data, val_data, epochs=2): +def SimpleRNNLearner(train_data, val_data, epochs=2): """ - rnn example for text sentimental analysis + RNN example for text sentimental analysis. :param train_data: a tuple of (training data, targets) Training data: ndarray taking training examples, while each example is coded by embedding - Targets: ndarry taking targets of each example. Each target is mapped to an integer. + Targets: ndarray taking targets of each example. Each target is mapped to an integer. 
:param val_data: a tuple of (validation data, targets) + :param epochs: number of epochs :return: a keras model """ @@ -479,7 +449,7 @@ def simple_rnn_learner(train_data, val_data, epochs=2): def keras_dataset_loader(dataset, max_length=500): """ - helper function to load keras datasets + Helper function to load keras datasets. :param dataset: keras data set type :param max_length: max length of each input sequence """ @@ -491,10 +461,14 @@ def keras_dataset_loader(dataset, max_length=500): return (X_train[10:], y_train[10:]), (X_val, y_val), (X_train[:10], y_train[:10]) -def auto_encoder_learner(inputs, encoding_size, epochs=200): - """simple example of linear auto encoder learning producing the input itself. +def AutoencoderLearner(inputs, encoding_size, epochs=200): + """ + Simple example of linear auto encoder learning producing the input itself. :param inputs: a batch of input data in np.ndarray type - :param encoding_size: int, the size of encoding layer""" + :param encoding_size: int, the size of encoding layer + :param epochs: number of epochs + :return: a keras model + """ # init data input_size = len(inputs[0]) diff --git a/knowledge.py b/knowledge.py index d237090ee..eaeacf7d9 100644 --- a/knowledge.py +++ b/knowledge.py @@ -1,4 +1,4 @@ -"""Knowledge in learning, Chapter 19""" +"""Knowledge in learning (Chapter 19)""" from random import shuffle from math import log @@ -13,10 +13,12 @@ # ______________________________________________________________________________ -def current_best_learning(examples, h, examples_so_far=[]): +def current_best_learning(examples, h, examples_so_far=None): """ [Figure 19.2] The hypothesis is a list of dictionaries, with each dictionary representing a disjunction.""" + if examples_so_far is None: + examples_so_far = [] if not examples: return h diff --git a/knowledge_FOIL.ipynb b/knowledge_FOIL.ipynb index 63e943416..4cefd7f69 100644 --- a/knowledge_FOIL.ipynb +++ b/knowledge_FOIL.ipynb @@ -18,8 +18,7 @@ "outputs": [], 
"source": [ "from knowledge import *\n", - "\n", - "from notebook import pseudocode, psource" + "from notebook import psource" ] }, { @@ -624,8 +623,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/learning.ipynb b/learning.ipynb index aecd5d2d3..0cadd4e7b 100644 --- a/learning.ipynb +++ b/learning.ipynb @@ -16,6 +16,7 @@ "outputs": [], "source": [ "from learning import *\n", + "from probabilistic_learning import *\n", "from notebook import *" ] }, @@ -2247,8 +2248,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/learning.py b/learning.py index 7fe536f96..31aabe30f 100644 --- a/learning.py +++ b/learning.py @@ -1,4 +1,4 @@ -"""Learn to estimate functions from examples. (Chapters 18, 20)""" +"""Learning from examples. 
(Chapters 18)""" import copy import heapq @@ -7,46 +7,46 @@ from collections import defaultdict from statistics import mean, stdev -from utils import ( - removeall, unique, product, mode, argmax, argmax_random_tie, isclose, gaussian, - dotproduct, vector_add, scalar_vector_product, weighted_sample_with_replacement, - weighted_sampler, num_or_str, normalize, clip, sigmoid, print_table, - open_data, sigmoid_derivative, probability, norm, matrix_multiplication, relu, relu_derivative, - tanh, tanh_derivative, leaky_relu_derivative, elu, elu_derivative, - mean_boolean_error) +from probabilistic_learning import NaiveBayesLearner +from utils import (remove_all, unique, mode, argmax, argmax_random_tie, isclose, dotproduct, vector_add, + scalar_vector_product, weighted_sample_with_replacement, num_or_str, normalize, clip, sigmoid, + print_table, open_data, sigmoid_derivative, probability, relu, relu_derivative, tanh, + tanh_derivative, leaky_relu_derivative, elu, elu_derivative, mean_boolean_error, random_weights) class DataSet: - """A data set for a machine learning problem. It has the following fields: + """ + A data set for a machine learning problem. It has the following fields: d.examples A list of examples. Each one is a list of attribute values. d.attrs A list of integers to index into an example, so example[attr] gives a value. Normally the same as range(len(d.examples[0])). - d.attrnames Optional list of mnemonic names for corresponding attrs. + d.attr_names Optional list of mnemonic names for corresponding attrs. d.target The attribute that a learning algorithm will try to predict. By default the final attribute. d.inputs The list of attrs without the target. d.values A list of lists: each sublist is the set of possible values for the corresponding attribute. If initially None, - it is computed from the known examples by self.setproblem. + it is computed from the known examples by self.set_problem. If not None, an erroneous value raises ValueError. 
- d.distance A function from a pair of examples to a nonnegative number. + d.distance A function from a pair of examples to a non-negative number. Should be symmetric, etc. Defaults to mean_boolean_error since that can handle any field types. d.name Name of the data set (for output display only). d.source URL or other source where the data came from. d.exclude A list of attribute indexes to exclude from d.inputs. Elements - of this list can either be integers (attrs) or attrnames. + of this list can either be integers (attrs) or attr_names. Normally, you call the constructor and you're done; then you just - access fields like d.examples and d.target and d.inputs.""" + access fields like d.examples and d.target and d.inputs. + """ - def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, - inputs=None, values=None, distance=mean_boolean_error, - name='', source='', exclude=()): - """Accepts any of DataSet's fields. Examples can also be a + def __init__(self, examples=None, attrs=None, attr_names=None, target=-1, inputs=None, + values=None, distance=mean_boolean_error, name='', source='', exclude=()): + """ + Accepts any of DataSet's fields. Examples can also be a string or file from which to parse examples using parse_csv. - Optional parameter: exclude, as documented in .setproblem(). + Optional parameter: exclude, as documented in .set_problem(). >>> DataSet(examples='1, 2, 3') """ @@ -56,7 +56,7 @@ def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, self.distance = distance self.got_values_flag = bool(values) - # Initialize .examples from string or list or data directory + # initialize .examples from string or list or data directory if isinstance(examples, str): self.examples = parse_csv(examples) elif examples is None: @@ -64,39 +64,40 @@ def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, else: self.examples = examples - # Attrs are the indices of examples, unless otherwise stated. 
+ # attrs are the indices of examples, unless otherwise stated. if self.examples is not None and attrs is None: attrs = list(range(len(self.examples[0]))) self.attrs = attrs - # Initialize .attrnames from string, list, or by default - if isinstance(attrnames, str): - self.attrnames = attrnames.split() + # initialize .attr_names from string, list, or by default + if isinstance(attr_names, str): + self.attr_names = attr_names.split() else: - self.attrnames = attrnames or attrs - self.setproblem(target, inputs=inputs, exclude=exclude) + self.attr_names = attr_names or attrs + self.set_problem(target, inputs=inputs, exclude=exclude) - def setproblem(self, target, inputs=None, exclude=()): - """Set (or change) the target and/or inputs. + def set_problem(self, target, inputs=None, exclude=()): + """ + Set (or change) the target and/or inputs. This way, one DataSet can be used multiple ways. inputs, if specified, is a list of attributes, or specify exclude as a list of attributes - to not use in inputs. Attributes can be -n .. n, or an attrname. - Also computes the list of possible values, if that wasn't done yet.""" - self.target = self.attrnum(target) - exclude = list(map(self.attrnum, exclude)) + to not use in inputs. Attributes can be -n .. n, or an attr_name. + Also computes the list of possible values, if that wasn't done yet. 
+ """ + self.target = self.attr_num(target) + exclude = list(map(self.attr_num, exclude)) if inputs: - self.inputs = removeall(self.target, inputs) + self.inputs = remove_all(self.target, inputs) else: - self.inputs = [a for a in self.attrs - if a != self.target and a not in exclude] + self.inputs = [a for a in self.attrs if a != self.target and a not in exclude] if not self.values: self.update_values() self.check_me() def check_me(self): """Check that my fields make sense.""" - assert len(self.attrnames) == len(self.attrs) + assert len(self.attr_names) == len(self.attrs) assert self.target in self.attrs assert self.target not in self.inputs assert set(self.inputs).issubset(set(self.attrs)) @@ -115,12 +116,12 @@ def check_example(self, example): for a in self.attrs: if example[a] not in self.values[a]: raise ValueError('Bad value {} for attribute {} in {}' - .format(example[a], self.attrnames[a], example)) + .format(example[a], self.attr_names[a], example)) - def attrnum(self, attr): + def attr_num(self, attr): """Returns the number used for attr, which can be a name, or -n .. 
n-1.""" if isinstance(attr, str): - return self.attrnames.index(attr) + return self.attr_names.index(attr) elif attr < 0: return len(self.attrs) + attr else: @@ -131,13 +132,12 @@ def update_values(self): def sanitize(self, example): """Return a copy of example, with non-input attributes replaced by None.""" - return [attr_i if i in self.inputs else None - for i, attr_i in enumerate(example)] + return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)] def classes_to_numbers(self, classes=None): """Converts class names to numbers.""" if not classes: - # If classes were not given, extract them from values + # if classes were not given, extract them from values classes = sorted(self.values[self.target]) for item in self.examples: item[self.target] = classes.index(item[self.target]) @@ -153,17 +153,19 @@ def split_values_by_classes(self): target_names = self.values[self.target] for v in self.examples: - item = [a for a in v if a not in target_names] # Remove target from item - buckets[v[self.target]].append(item) # Add item to bucket of its class + item = [a for a in v if a not in target_names] # remove target from item + buckets[v[self.target]].append(item) # add item to bucket of its class return buckets def find_means_and_deviations(self): - """Finds the means and standard deviations of self.dataset. - means : A dictionary for each class/target. Holds a list of the means + """ + Finds the means and standard deviations of self.dataset. + means : a dictionary for each class/target. Holds a list of the means of the features for the class. - deviations: A dictionary for each class/target. Holds a list of the sample - standard deviations of the features for the class.""" + deviations: a dictionary for each class/target. Holds a list of the sample + standard deviations of the features for the class. 
+ """ target_names = self.values[self.target] feature_numbers = len(self.inputs) @@ -173,13 +175,13 @@ def find_means_and_deviations(self): deviations = defaultdict(lambda: [0] * feature_numbers) for t in target_names: - # Find all the item feature values for item in class t - features = [[] for i in range(feature_numbers)] + # find all the item feature values for item in class t + features = [[] for _ in range(feature_numbers)] for item in item_buckets[t]: for i in range(feature_numbers): features[i].append(item[i]) - # Calculate means and deviations fo the class + # calculate means and deviations fo the class for i in range(feature_numbers): means[t][i] = mean(features[i]) deviations[t][i] = stdev(features[i]) @@ -187,285 +189,182 @@ def find_means_and_deviations(self): return means, deviations def __repr__(self): - return ''.format( - self.name, len(self.examples), len(self.attrs)) - - -# ______________________________________________________________________________ + return ''.format(self.name, len(self.examples), len(self.attrs)) def parse_csv(input, delim=','): - r"""Input is a string consisting of lines, each line has comma-delimited + r""" + Input is a string consisting of lines, each line has comma-delimited fields. Convert this into a list of lists. Blank lines are skipped. Fields that look like numbers are converted to numbers. The delim defaults to ',' but '\t' and None are also reasonable values. >>> parse_csv('1, 2, 3 \n 0, 2, na') - [[1, 2, 3], [0, 2, 'na']]""" + [[1, 2, 3], [0, 2, 'na']] + """ lines = [line for line in input.splitlines() if line.strip()] return [list(map(num_or_str, line.split(delim))) for line in lines] -# ______________________________________________________________________________ - - -class CountingProbDist: - """A probability distribution formed by observing and counting examples. 
- If p is an instance of this class and o is an observed value, then - there are 3 main operations: - p.add(o) increments the count for observation o by 1. - p.sample() returns a random element from the distribution. - p[o] returns the probability for o (as in a regular ProbDist).""" - - def __init__(self, observations=None, default=0): - """Create a distribution, and optionally add in some observations. - By default this is an unsmoothed distribution, but saying default=1, - for example, gives you add-one smoothing.""" - if observations is None: - observations = [] - self.dictionary = {} - self.n_obs = 0 - self.default = default - self.sampler = None - - for o in observations: - self.add(o) - - def add(self, o): - """Add an observation o to the distribution.""" - self.smooth_for(o) - self.dictionary[o] += 1 - self.n_obs += 1 - self.sampler = None - - def smooth_for(self, o): - """Include o among the possible observations, whether or not - it's been observed yet.""" - if o not in self.dictionary: - self.dictionary[o] = self.default - self.n_obs += self.default - self.sampler = None - - def __getitem__(self, item): - """Return an estimate of the probability of item.""" - self.smooth_for(item) - return self.dictionary[item] / self.n_obs - - # (top() and sample() are not used in this module, but elsewhere.) - - def top(self, n): - """Return (count, obs) tuples for the n most frequent observations.""" - return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()]) - - def sample(self): - """Return a random sample from the distribution.""" - if self.sampler is None: - self.sampler = weighted_sampler(list(self.dictionary.keys()), - list(self.dictionary.values())) - return self.sampler() - - -# ______________________________________________________________________________ - - -def PluralityLearner(dataset): - """A very dumb algorithm: always pick the result that was most popular - in the training data. 
Makes a baseline for comparison.""" - most_popular = mode([e[dataset.target] for e in dataset.examples]) - - def predict(example): - """Always return same result: the most popular from the training set.""" - return most_popular - - return predict +def err_ratio(predict, dataset, examples=None, verbose=0): + """ + Return the proportion of the examples that are NOT correctly predicted. + verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct + """ + examples = examples or dataset.examples + if len(examples) == 0: + return 0.0 + right = 0 + for example in examples: + desired = example[dataset.target] + output = predict(dataset.sanitize(example)) + if output == desired: + right += 1 + if verbose >= 2: + print(' OK: got {} for {}'.format(desired, example)) + elif verbose: + print('WRONG: got {}, expected {} for {}'.format(output, desired, example)) + return 1 - (right / len(examples)) -# ______________________________________________________________________________ +def grade_learner(predict, tests): + """ + Grades the given learner based on how many tests it passes. + tests is a list with each element in the form: (values, output). + """ + return mean(int(predict(X) == y) for X, y in tests) -def NaiveBayesLearner(dataset, continuous=True, simple=False): - if simple: - return NaiveBayesSimple(dataset) - if continuous: - return NaiveBayesContinuous(dataset) +def train_test_split(dataset, start=None, end=None, test_split=None): + """ + If you are giving 'start' and 'end' as parameters, + then it will return the testing set from index 'start' to 'end' + and the rest for training. + If you give 'test_split' as a parameter then it will return + test_split * 100% as the testing set and the rest as + training set. 
+ """ + examples = dataset.examples + if test_split is None: + train = examples[:start] + examples[end:] + val = examples[start:end] else: - return NaiveBayesDiscrete(dataset) - - -def NaiveBayesSimple(distribution): - """A simple naive bayes classifier that takes as input a dictionary of - CountingProbDist objects and classifies items according to these distributions. - The input dictionary is in the following form: - (ClassName, ClassProb): CountingProbDist""" - target_dist = {c_name: prob for c_name, prob in distribution.keys()} - attr_dists = {c_name: count_prob for (c_name, _), count_prob in distribution.items()} - - def predict(example): - """Predict the target value for example. Calculate probabilities for each - class and pick the max.""" - - def class_probability(targetval): - attr_dist = attr_dists[targetval] - return target_dist[targetval] * product(attr_dist[a] for a in example) - - return argmax(target_dist.keys(), key=class_probability) - - return predict - - -def NaiveBayesDiscrete(dataset): - """Just count how many times each value of each input attribute - occurs, conditional on the target value. Count the different - target values too.""" - - target_vals = dataset.values[dataset.target] - target_dist = CountingProbDist(target_vals) - attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr]) - for gv in target_vals - for attr in dataset.inputs} - for example in dataset.examples: - targetval = example[dataset.target] - target_dist.add(targetval) - for attr in dataset.inputs: - attr_dists[targetval, attr].add(example[attr]) - - def predict(example): - """Predict the target value for example. 
Consider each possible value, - and pick the most likely by looking at each attribute independently.""" - - def class_probability(targetval): - return (target_dist[targetval] * - product(attr_dists[targetval, attr][example[attr]] - for attr in dataset.inputs)) + total_size = len(examples) + val_size = int(total_size * test_split) + train_size = total_size - val_size + train = examples[:train_size] + val = examples[train_size:total_size] - return argmax(target_vals, key=class_probability) + return train, val - return predict +def cross_validation_wrapper(learner, dataset, k=10, trials=1): + """ + [Figure 18.8] + Return the optimal value of size having minimum error on validation set. + errT: a training error array, indexed by size + errV: a validation error array, indexed by size + """ + errs = [] + size = 1 + while True: + errT, errV = cross_validation(learner, dataset, size, k, trials) + # check for convergence provided err_val is not empty + if errT and not isclose(errT[-1], errT, rel_tol=1e-6): + best_size = 0 + min_val = math.inf + i = 0 + while i < size: + if errs[i] < min_val: + min_val = errs[i] + best_size = i + i += 1 + return learner(dataset, best_size) + errs.append(errV) + size += 1 -def NaiveBayesContinuous(dataset): - """Count how many times each target value occurs. - Also, find the means and deviations of input attribute values for each target value.""" - means, deviations = dataset.find_means_and_deviations() - target_vals = dataset.values[dataset.target] - target_dist = CountingProbDist(target_vals) +def cross_validation(learner, dataset, size=None, k=10, trials=1): + """ + Do k-fold cross_validate and return their mean. + That is, keep out 1/k of the examples for testing on each of k runs. + Shuffle the examples first; if trials>1, average over several shuffles. 
+ Returns Training error, Validation error + """ + k = k or len(dataset.examples) + if trials > 1: + trial_errT = 0 + trial_errV = 0 + for t in range(trials): + errT, errV = cross_validation(learner, dataset, size, k, trials) + trial_errT += errT + trial_errV += errV + return trial_errT / trials, trial_errV / trials + else: + fold_errT = 0 + fold_errV = 0 + n = len(dataset.examples) + examples = dataset.examples + random.shuffle(dataset.examples) + for fold in range(k): + train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k)) + dataset.examples = train_data + h = learner(dataset, size) + fold_errT += err_ratio(h, dataset, train_data) + fold_errV += err_ratio(h, dataset, val_data) + # reverting back to original once test is completed + dataset.examples = examples + return fold_errT / k, fold_errV / k - def predict(example): - """Predict the target value for example. Consider each possible value, - and pick the most likely by looking at each attribute independently.""" - def class_probability(targetval): - prob = target_dist[targetval] - for attr in dataset.inputs: - prob *= gaussian(means[targetval][attr], deviations[targetval][attr], example[attr]) - return prob +def leave_one_out(learner, dataset, size=None): + """Leave one out cross-validation over the dataset.""" + return cross_validation(learner, dataset, size, len(dataset.examples)) - return argmax(target_vals, key=class_probability) - return predict +# TODO learning_curve needs to be fixed +def learning_curve(learner, dataset, trials=10, sizes=None): + if sizes is None: + sizes = list(range(2, len(dataset.examples) - 10, 2)) + def score(learner, size): + random.shuffle(dataset.examples) + return train_test_split(learner, dataset, 0, size) -# ______________________________________________________________________________ + return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes] -def NearestNeighborLearner(dataset, k=1): - """k-NearestNeighbor: the 
k nearest neighbors vote.""" +def PluralityLearner(dataset): + """ + A very dumb algorithm: always pick the result that was most popular + in the training data. Makes a baseline for comparison. + """ + most_popular = mode([e[dataset.target] for e in dataset.examples]) def predict(example): - """Find the k closest items, and have them vote for the best.""" - best = heapq.nsmallest(k, ((dataset.distance(e, example), e) - for e in dataset.examples)) - return mode(e[dataset.target] for (d, e) in best) + """Always return same result: the most popular from the training set.""" + return most_popular return predict -# ______________________________________________________________________________ - - -def truncated_svd(X, num_val=2, max_iter=1000): - """Compute the first component of SVD.""" - - def normalize_vec(X, n=2): - """Normalize two parts (:m and m:) of the vector.""" - X_m = X[:m] - X_n = X[m:] - norm_X_m = norm(X_m, n) - Y_m = [x / norm_X_m for x in X_m] - norm_X_n = norm(X_n, n) - Y_n = [x / norm_X_n for x in X_n] - return Y_m + Y_n - - def remove_component(X): - """Remove components of already obtained eigen vectors from X.""" - X_m = X[:m] - X_n = X[m:] - for eivec in eivec_m: - coeff = dotproduct(X_m, eivec) - X_m = [x1 - coeff * x2 for x1, x2 in zip(X_m, eivec)] - for eivec in eivec_n: - coeff = dotproduct(X_n, eivec) - X_n = [x1 - coeff * x2 for x1, x2 in zip(X_n, eivec)] - return X_m + X_n - - m, n = len(X), len(X[0]) - A = [[0] * (n + m) for _ in range(n + m)] - for i in range(m): - for j in range(n): - A[i][m + j] = A[m + j][i] = X[i][j] - - eivec_m = [] - eivec_n = [] - eivals = [] - - for _ in range(num_val): - X = [random.random() for _ in range(m + n)] - X = remove_component(X) - X = normalize_vec(X) - - for i in range(max_iter): - old_X = X - X = matrix_multiplication(A, [[x] for x in X]) - X = [x[0] for x in X] - X = remove_component(X) - X = normalize_vec(X) - # check for convergence - if norm([x1 - x2 for x1, x2 in zip(old_X, X)]) <= 1e-10: - 
break - - projected_X = matrix_multiplication(A, [[x] for x in X]) - projected_X = [x[0] for x in projected_X] - new_eigenvalue = norm(projected_X, 1) / norm(X, 1) - ev_m = X[:m] - ev_n = X[m:] - if new_eigenvalue < 0: - new_eigenvalue = -new_eigenvalue - ev_m = [-ev_m_i for ev_m_i in ev_m] - eivals.append(new_eigenvalue) - eivec_m.append(ev_m) - eivec_n.append(ev_n) - return (eivec_m, eivec_n, eivals) - - -# ______________________________________________________________________________ - - class DecisionFork: - """A fork of a decision tree holds an attribute to test, and a dict - of branches, one for each of the attribute's values.""" + """ + A fork of a decision tree holds an attribute to test, and a dict + of branches, one for each of the attribute's values. + """ - def __init__(self, attr, attrname=None, default_child=None, branches=None): + def __init__(self, attr, attr_name=None, default_child=None, branches=None): """Initialize by saying what attribute this node tests.""" self.attr = attr - self.attrname = attrname or attr + self.attr_name = attr_name or attr self.default_child = default_child self.branches = branches or {} def __call__(self, example): """Given an example, classify it using the attribute and the branches.""" - attrvalue = example[self.attr] - if attrvalue in self.branches: - return self.branches[attrvalue](example) + attr_val = example[self.attr] + if attr_val in self.branches: + return self.branches[attr_val](example) else: # return default class when attribute is unknown return self.default_child(example) @@ -475,15 +374,14 @@ def add(self, val, subtree): self.branches[val] = subtree def display(self, indent=0): - name = self.attrname + name = self.attr_name print('Test', name) for (val, subtree) in self.branches.items(): print(' ' * 4 * indent, name, '=', val, '==>', end=' ') subtree.display(indent + 1) - print() # newline def __repr__(self): - return ('DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attrname, self.branches)) + 
return 'DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attr_name, self.branches) class DecisionLeaf: @@ -495,16 +393,13 @@ def __init__(self, result): def __call__(self, example): return self.result - def display(self, indent=0): + def display(self): print('RESULT =', self.result) def __repr__(self): return repr(self.result) -# ______________________________________________________________________________ - - def DecisionTreeLearner(dataset): """[Figure 18.5]""" @@ -513,21 +408,22 @@ def DecisionTreeLearner(dataset): def decision_tree_learning(examples, attrs, parent_examples=()): if len(examples) == 0: return plurality_value(parent_examples) - elif all_same_class(examples): + if all_same_class(examples): return DecisionLeaf(examples[0][target]) - elif len(attrs) == 0: + if len(attrs) == 0: return plurality_value(examples) - else: - A = choose_attribute(attrs, examples) - tree = DecisionFork(A, dataset.attrnames[A], plurality_value(examples)) - for (v_k, exs) in split_by(A, examples): - subtree = decision_tree_learning(exs, removeall(A, attrs), examples) - tree.add(v_k, subtree) - return tree + A = choose_attribute(attrs, examples) + tree = DecisionFork(A, dataset.attr_names[A], plurality_value(examples)) + for (v_k, exs) in split_by(A, examples): + subtree = decision_tree_learning(exs, remove_all(A, attrs), examples) + tree.add(v_k, subtree) + return tree def plurality_value(examples): - """Return the most popular target value for this set of examples. - (If target is binary, this is the majority; otherwise plurality.)""" + """ + Return the most popular target value for this set of examples. + (If target is binary, this is the majority; otherwise plurality). 
+ """ popular = argmax_random_tie(values[target], key=lambda v: count(target, v, examples)) return DecisionLeaf(popular) @@ -548,64 +444,30 @@ def information_gain(attr, examples): """Return the expected reduction in entropy from splitting by attr.""" def I(examples): - return information_content([count(target, v, examples) - for v in values[target]]) + return information_content([count(target, v, examples) for v in values[target]]) N = len(examples) - remainder = sum((len(examples_i) / N) * I(examples_i) - for (v, examples_i) in split_by(attr, examples)) + remainder = sum((len(examples_i) / N) * I(examples_i) for (v, examples_i) in split_by(attr, examples)) return I(examples) - remainder def split_by(attr, examples): """Return a list of (val, examples) pairs for each val of attr.""" - return [(v, [e for e in examples if e[attr] == v]) - for v in values[attr]] + return [(v, [e for e in examples if e[attr] == v]) for v in values[attr]] return decision_tree_learning(dataset.examples, dataset.inputs) def information_content(values): """Number of bits to represent the probability distribution in values.""" - probabilities = normalize(removeall(0, values)) + probabilities = normalize(remove_all(0, values)) return sum(-p * math.log2(p) for p in probabilities) -# ______________________________________________________________________________ - - -def RandomForest(dataset, n=5): - """An ensemble of Decision Trees trained using bagging and feature bagging.""" - - def data_bagging(dataset, m=0): - """Sample m examples with replacement""" - n = len(dataset.examples) - return weighted_sample_with_replacement(m or n, dataset.examples, [1] * n) - - def feature_bagging(dataset, p=0.7): - """Feature bagging with probability p to retain an attribute""" - inputs = [i for i in dataset.inputs if probability(p)] - return inputs or dataset.inputs - - def predict(example): - print([predictor(example) for predictor in predictors]) - return mode(predictor(example) for predictor in 
predictors) - - predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), - attrs=dataset.attrs, - attrnames=dataset.attrnames, - target=dataset.target, - inputs=feature_bagging(dataset))) for _ in range(n)] - - return predict - - -# ______________________________________________________________________________ - -# A decision list is implemented as a list of (test, value) pairs. - - def DecisionListLearner(dataset): - """[Figure 18.11]""" + """ + [Figure 18.11] + A decision list implemented as a list of (test, value) pairs. + """ def decision_list_learning(examples): if not examples: @@ -616,8 +478,10 @@ def decision_list_learning(examples): return [(t, o)] + decision_list_learning(examples - examples_t) def find_examples(examples): - """Find a set of examples that all have the same outcome under - some test. Return a tuple of the test, outcome, and examples.""" + """ + Find a set of examples that all have the same outcome under + some test. Return a tuple of the test, outcome, and examples. + """ raise NotImplementedError def passes(example, test): @@ -635,16 +499,112 @@ def predict(example): return predict -# ______________________________________________________________________________ +def NearestNeighborLearner(dataset, k=1): + """k-NearestNeighbor: the k nearest neighbors vote.""" + + def predict(example): + """Find the k closest items, and have them vote for the best.""" + best = heapq.nsmallest(k, ((dataset.distance(e, example), e) for e in dataset.examples)) + return mode(e[dataset.target] for (d, e) in best) + + return predict + + +def LinearLearner(dataset, learning_rate=0.01, epochs=100): + """ + [Section 18.6.3] + Linear classifier with hard threshold. 
+ """ + idx_i = dataset.inputs + idx_t = dataset.target + examples = dataset.examples + num_examples = len(examples) + + # X transpose + X_col = [dataset.values[i] for i in idx_i] # vertical columns of X + + # add dummy + ones = [1 for _ in range(len(examples))] + X_col = [ones] + X_col + + # initialize random weights + num_weights = len(idx_i) + 1 + w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) + + for epoch in range(epochs): + err = [] + # pass over all examples + for example in examples: + x = [1] + example + y = dotproduct(w, x) + t = example[idx_t] + err.append(t - y) + + # update weights + for i in range(len(w)): + w[i] = w[i] + learning_rate * (dotproduct(err, X_col[i]) / num_examples) + def predict(example): + x = [1] + example + return dotproduct(w, x) -def NeuralNetLearner(dataset, hidden_layer_sizes=[3], learning_rate=0.01, epochs=100, activation=sigmoid): - """Layered feed-forward network. + return predict + + +def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100): + """ + [Section 18.6.4] + Linear classifier with logistic regression. 
+ """ + idx_i = dataset.inputs + idx_t = dataset.target + examples = dataset.examples + num_examples = len(examples) + + # X transpose + X_col = [dataset.values[i] for i in idx_i] # vertical columns of X + + # add dummy + ones = [1 for _ in range(len(examples))] + X_col = [ones] + X_col + + # initialize random weights + num_weights = len(idx_i) + 1 + w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) + + for epoch in range(epochs): + err = [] + h = [] + # pass over all examples + for example in examples: + x = [1] + example + y = sigmoid(dotproduct(w, x)) + h.append(sigmoid_derivative(y)) + t = example[idx_t] + err.append(t - y) + + # update weights + for i in range(len(w)): + buffer = [x * y for x, y in zip(err, h)] + w[i] = w[i] + learning_rate * (dotproduct(buffer, X_col[i]) / num_examples) + + def predict(example): + x = [1] + example + return sigmoid(dotproduct(w, x)) + + return predict + + +def NeuralNetLearner(dataset, hidden_layer_sizes=None, learning_rate=0.01, epochs=100, activation=sigmoid): + """ + Layered feed-forward network. 
hidden_layer_sizes: List of number of hidden units per hidden layer learning_rate: Learning rate of gradient descent epochs: Number of passes over the dataset """ + if hidden_layer_sizes is None: + hidden_layer_sizes = [3] i_units = len(dataset.inputs) o_units = len(dataset.values[dataset.target]) @@ -653,21 +613,21 @@ def NeuralNetLearner(dataset, hidden_layer_sizes=[3], learning_rate=0.01, epochs learned_net = BackPropagationLearner(dataset, raw_net, learning_rate, epochs, activation) def predict(example): - # Input nodes + # input nodes i_nodes = learned_net[0] - # Activate input layer + # activate input layer for v, n in zip(example, i_nodes): n.value = v - # Forward pass + # forward pass for layer in learned_net[1:]: for node in layer: inc = [n.value for n in node.inputs] in_val = dotproduct(inc, node.weights) node.value = node.activation(in_val) - # Hypothesis + # hypothesis o_nodes = learned_net[-1] prediction = find_max_node(o_nodes) return prediction @@ -675,24 +635,20 @@ def predict(example): return predict -def random_weights(min_value, max_value, num_weights): - return [random.uniform(min_value, max_value) for _ in range(num_weights)] - - def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid): - """[Figure 18.23] The back-propagation algorithm for multilayer networks""" - # Initialise weights + """ + [Figure 18.23] + The back-propagation algorithm for multilayer networks. + """ + # initialise weights for layer in net: for node in layer: - node.weights = random_weights(min_value=-0.5, max_value=0.5, - num_weights=len(node.weights)) + node.weights = random_weights(min_value=-0.5, max_value=0.5, num_weights=len(node.weights)) examples = dataset.examples - ''' - As of now dataset.target gives an int instead of list, - Changing dataset class will have effect on all the learners. - Will be taken care of later. 
- ''' + # As of now dataset.target gives an int instead of list, + # Changing dataset class will have effect on all the learners. + # Will be taken care of later. o_nodes = net[-1] i_nodes = net[0] o_units = len(o_nodes) @@ -703,31 +659,31 @@ def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmo inputs, targets = init_examples(examples, idx_i, idx_t, o_units) for epoch in range(epochs): - # Iterate over each example + # iterate over each example for e in range(len(examples)): i_val = inputs[e] t_val = targets[e] - # Activate input layer + # activate input layer for v, n in zip(i_val, i_nodes): n.value = v - # Forward pass + # forward pass for layer in net[1:]: for node in layer: inc = [n.value for n in node.inputs] in_val = dotproduct(inc, node.weights) node.value = node.activation(in_val) - # Initialize delta + # initialize delta delta = [[] for _ in range(n_layers)] - # Compute outer layer delta + # compute outer layer delta - # Error for the MSE cost function + # error for the MSE cost function err = [t_val[i] - o_nodes[i].value for i in range(o_units)] - # Calculate delta at output + # calculate delta at output if node.activation == sigmoid: delta[-1] = [sigmoid_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] elif node.activation == relu: @@ -739,7 +695,7 @@ def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmo else: delta[-1] = [leaky_relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)] - # Backward pass + # backward pass h_layers = n_layers - 2 for i in range(h_layers, 0, -1): layer = net[i] @@ -765,7 +721,7 @@ def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmo delta[i] = [leaky_relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1]) for j in range(h_units)] - # Update weights + # update weights for i in range(1, n_layers): layer = net[i] inc = [node.value for node in net[i - 1]] @@ -788,19 +744,20 @@ def PerceptronLearner(dataset, 
learning_rate=0.01, epochs=100): def predict(example): o_nodes = learned_net[1] - # Forward pass + # forward pass for node in o_nodes: in_val = dotproduct(example, node.weights) node.value = node.activation(in_val) - # Hypothesis + # hypothesis return find_max_node(o_nodes) return predict class NNUnit: - """Single Unit of Multiple Layer Neural Network + """ + Single Unit of Multiple Layer Neural Network inputs: Incoming connections weights: Weights to incoming connections """ @@ -813,17 +770,18 @@ def __init__(self, activation=sigmoid, weights=None, inputs=None): def network(input_units, hidden_layer_sizes, output_units, activation=sigmoid): - """Create Directed Acyclic Network of given number layers. + """ + Create Directed Acyclic Network of given number layers. hidden_layers_sizes : List number of neuron units in each hidden layer excluding input and output layers """ layers_sizes = [input_units] + hidden_layer_sizes + [output_units] - net = [[NNUnit(activation) for n in range(size)] + net = [[NNUnit(activation) for _ in range(size)] for size in layers_sizes] n_layers = len(net) - # Make Connection + # make connection for i in range(1, n_layers): for n in net[i]: for k in net[i - 1]: @@ -836,16 +794,16 @@ def init_examples(examples, idx_i, idx_t, o_units): inputs, targets = {}, {} for i, e in enumerate(examples): - # Input values of e + # input values of e inputs[i] = [e[i] for i in idx_i] if o_units > 1: - # One-Hot representation of e's target + # one-hot representation of e's target t = [0 for i in range(o_units)] t[e[idx_t]] = 1 targets[i] = t else: - # Target value of e + # target value of e targets[i] = [e[idx_t]] return inputs, targets @@ -855,50 +813,6 @@ def find_max_node(nodes): return nodes.index(argmax(nodes, key=lambda node: node.value)) -# ______________________________________________________________________________ - - -def LinearLearner(dataset, learning_rate=0.01, epochs=100): - """Define with learner = LinearLearner(data); infer with 
learner(x).""" - idx_i = dataset.inputs - idx_t = dataset.target # As of now, dataset.target gives only one index. - examples = dataset.examples - num_examples = len(examples) - - # X transpose - X_col = [dataset.values[i] for i in idx_i] # vertical columns of X - - # Add dummy - ones = [1 for _ in range(len(examples))] - X_col = [ones] + X_col - - # Initialize random weights - num_weights = len(idx_i) + 1 - w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) - - for epoch in range(epochs): - err = [] - # Pass over all examples - for example in examples: - x = [1] + example - y = dotproduct(w, x) - t = example[idx_t] - err.append(t - y) - - # update weights - for i in range(len(w)): - w[i] = w[i] + learning_rate * (dotproduct(err, X_col[i]) / num_examples) - - def predict(example): - x = [1] + example - return dotproduct(w, x) - - return predict - - -# ______________________________________________________________________________ - - def EnsembleLearner(learners): """Given a list of learning algorithms, have them vote.""" @@ -913,48 +827,40 @@ def predict(example): return train -# ______________________________________________________________________________ - - -def AdaBoost(L, K): +def ada_boost(dataset, L, K): """[Figure 18.34]""" - def train(dataset): - examples, target = dataset.examples, dataset.target - N = len(examples) - epsilon = 1 / (2 * N) - w = [1 / N] * N - h, z = [], [] - for k in range(K): - h_k = L(dataset, w) - h.append(h_k) - error = sum(weight for example, weight in zip(examples, w) - if example[target] != h_k(example)) - - # Avoid divide-by-0 from either 0% or 100% error rates: - error = clip(error, epsilon, 1 - epsilon) - for j, example in enumerate(examples): - if example[target] == h_k(example): - w[j] *= error / (1 - error) - w = normalize(w) - z.append(math.log((1 - error) / error)) - return WeightedMajority(h, z) - - return train - - -def WeightedMajority(predictors, weights): + examples, target = dataset.examples, 
dataset.target + N = len(examples) + epsilon = 1 / (2 * N) + w = [1 / N] * N + h, z = [], [] + for k in range(K): + h_k = L(dataset, w) + h.append(h_k) + error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example)) + # avoid divide-by-0 from either 0% or 100% error rates + error = clip(error, epsilon, 1 - epsilon) + for j, example in enumerate(examples): + if example[target] == h_k(example): + w[j] *= error / (1 - error) + w = normalize(w) + z.append(math.log((1 - error) / error)) + return weighted_majority(h, z) + + +def weighted_majority(predictors, weights): """Return a predictor that takes a weighted vote.""" def predict(example): - return weighted_mode((predictor(example) for predictor in predictors), - weights) + return weighted_mode((predictor(example) for predictor in predictors), weights) return predict def weighted_mode(values, weights): - """Return the value with the greatest total weight. + """ + Return the value with the greatest total weight. 
>>> weighted_mode('abbaa', [1, 2, 3, 1, 2]) 'b' """ @@ -964,13 +870,36 @@ def weighted_mode(values, weights): return max(totals, key=totals.__getitem__) -# _____________________________________________________________________________ -# Adapting an unweighted learner for AdaBoost +def RandomForest(dataset, n=5): + """An ensemble of Decision Trees trained using bagging and feature bagging.""" + + def data_bagging(dataset, m=0): + """Sample m examples with replacement""" + n = len(dataset.examples) + return weighted_sample_with_replacement(m or n, dataset.examples, [1] * n) + + def feature_bagging(dataset, p=0.7): + """Feature bagging with probability p to retain an attribute""" + inputs = [i for i in dataset.inputs if probability(p)] + return inputs or dataset.inputs + + def predict(example): + print([predictor(example) for predictor in predictors]) + return mode(predictor(example) for predictor in predictors) + + predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), attrs=dataset.attrs, + attr_names=dataset.attr_names, target=dataset.target, + inputs=feature_bagging(dataset))) for _ in range(n)] + + return predict def WeightedLearner(unweighted_learner): - """Given a learner that takes just an unweighted dataset, return - one that takes also a weight for each example. [p. 749 footnote 14]""" + """ + [Page 749 footnote 14] + Given a learner that takes just an unweighted dataset, return + one that takes also a weight for each example. + """ def train(dataset, weights): return unweighted_learner(replicated_dataset(dataset, weights)) @@ -987,7 +916,8 @@ def replicated_dataset(dataset, weights, n=None): def weighted_replicate(seq, weights, n): - """Return n selections from seq, with the count of each element of + """ + Return n selections from seq, with the count of each element of seq proportional to the corresponding weight (filling in fractions randomly). 
def flatten(seqs):
    """
    Concatenate a sequence of lists into a single flat list.
    >>> flatten([[1, 2], [3]])
    [1, 2, 3]
    """
    # A single comprehension is linear in the total number of items,
    # unlike sum(seqs, []), which rebuilds the accumulator list on every
    # + and is therefore quadratic; it also accepts tuples/iterables as
    # the inner sequences, which sum(seqs, []) does not.
    return [item for seq in seqs for item in seq]
- - -orings = DataSet(name='orings', target='Distressed', - attrnames="Rings Distressed Temp Pressure Flightnum") +orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum') zoo = DataSet(name='zoo', target='type', exclude=['name'], - attrnames="name hair feathers eggs milk airborne aquatic " + - "predator toothed backbone breathes venomous fins legs tail " + - "domestic catsize type") + attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone ' + 'breathes venomous fins legs tail domestic catsize type') -iris = DataSet(name="iris", target="class", - attrnames="sepal-len sepal-width petal-len petal-width class") - - -# ______________________________________________________________________________ -# The Restaurant example from [Figure 18.2] +iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class') def RestaurantDataSet(examples=None): - """Build a DataSet of Restaurant waiting examples. [Figure 18.3]""" + """ + [Figure 18.3] + Build a DataSet of Restaurant waiting examples. + """ return DataSet(name='restaurant', target='Wait', examples=examples, - attrnames='Alternate Bar Fri/Sat Hungry Patrons Price ' + - 'Raining Reservation Type WaitEstimate Wait') + attr_names='Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait') restaurant = RestaurantDataSet() -def T(attrname, branches): - branches = {value: (child if isinstance(child, DecisionFork) - else DecisionLeaf(child)) +def T(attr_name, branches): + branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child)) for value, child in branches.items()} - return DecisionFork(restaurant.attrnum(attrname), attrname, print, branches) + return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches) -""" [Figure 18.2] +""" +[Figure 18.2] A decision tree for deciding whether to wait for a table at a hotel. 
""" @@ -1187,8 +976,7 @@ def T(attrname, branches): {'Yes': 'Yes', 'No': T('Bar', {'No': 'No', 'Yes': 'Yes'})}), - 'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})} - ), + 'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}), '10-30': T('Hungry', {'No': 'Yes', 'Yes': T('Alternate', @@ -1206,30 +994,30 @@ def gen(): example[restaurant.target] = waiting_decision_tree(example) return example - return RestaurantDataSet([gen() for i in range(n)]) - - -# ______________________________________________________________________________ -# Artificial, generated datasets. + return RestaurantDataSet([gen() for _ in range(n)]) def Majority(k, n): - """Return a DataSet with n k-bit examples of the majority problem: - k random bits followed by a 1 if more than half the bits are 1, else 0.""" + """ + Return a DataSet with n k-bit examples of the majority problem: + k random bits followed by a 1 if more than half the bits are 1, else 0. + """ examples = [] for i in range(n): - bits = [random.choice([0, 1]) for i in range(k)] + bits = [random.choice([0, 1]) for _ in range(k)] bits.append(int(sum(bits) > k / 2)) examples.append(bits) - return DataSet(name="majority", examples=examples) + return DataSet(name='majority', examples=examples) -def Parity(k, n, name="parity"): - """Return a DataSet with n k-bit examples of the parity problem: - k random bits followed by a 1 if an odd number of bits are 1, else 0.""" +def Parity(k, n, name='parity'): + """ + Return a DataSet with n k-bit examples of the parity problem: + k random bits followed by a 1 if an odd number of bits are 1, else 0. 
+ """ examples = [] for i in range(n): - bits = [random.choice([0, 1]) for i in range(k)] + bits = [random.choice([0, 1]) for _ in range(k)] bits.append(sum(bits) % 2) examples.append(bits) return DataSet(name=name, examples=examples) @@ -1237,31 +1025,29 @@ def Parity(k, n, name="parity"): def Xor(n): """Return a DataSet with n examples of 2-input xor.""" - return Parity(2, n, name="xor") + return Parity(2, n, name='xor') def ContinuousXor(n): """2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints.""" examples = [] for i in range(n): - x, y = [random.uniform(0.0, 2.0) for i in '12'] - examples.append([x, y, int(x) != int(y)]) - return DataSet(name="continuous xor", examples=examples) + x, y = [random.uniform(0.0, 2.0) for _ in '12'] + examples.append([x, y, x != y]) + return DataSet(name='continuous xor', examples=examples) -# ______________________________________________________________________________ +def compare(algorithms=None, datasets=None, k=10, trials=1): + """ + Compare various learners on various datasets using cross-validation. + Print results as a table. + """ + # default list of algorithms + algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner] + # default list of datasets + datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20), + Majority(7, 100), Parity(7, 100), Xor(100)] -def compare(algorithms=None, datasets=None, k=10, trials=1): - """Compare various learners on various datasets using cross-validation. 
- Print results as a table.""" - algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, # default list - NearestNeighborLearner, DecisionTreeLearner] # of algorithms - - datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20), # default list - Majority(7, 100), Parity(7, 100), Xor(100)] # of datasets - - print_table([[a.__name__.replace('Learner', '')] + - [cross_validation(a, d, k, trials) for d in datasets] - for a in algorithms], - header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f') + print_table([[a.__name__.replace('Learner', '')] + [cross_validation(a, d, k=k, trials=trials) for d in datasets] + for a in algorithms], header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f') diff --git a/learning4e.py b/learning4e.py index c8bdd44f2..5cf63dda4 100644 --- a/learning4e.py +++ b/learning4e.py @@ -1,3 +1,5 @@ +"""Learning from examples. (Chapters 18)""" + import copy import heapq import math @@ -5,49 +7,46 @@ from collections import defaultdict from statistics import mean, stdev -from utils4e import ( - removeall, unique, mode, argmax_random_tie, isclose, dotproduct, weighted_sample_with_replacement, - num_or_str, normalize, clip, print_table, open_data, probability, random_weights, - mean_boolean_error) - - -# Learn to estimate functions from examples. (Chapters 18) -# ______________________________________________________________________________ -# 18.2 Supervised learning. -# define supervised learning dataset and utility functions/ +from probabilistic_learning import NaiveBayesLearner +from utils import sigmoid, sigmoid_derivative +from utils4e import (remove_all, unique, mode, argmax_random_tie, isclose, dotproduct, weighted_sample_with_replacement, + num_or_str, normalize, clip, print_table, open_data, probability, random_weights, + mean_boolean_error) class DataSet: - """A data set for a machine learning problem. It has the following fields: + """ + A data set for a machine learning problem. 
It has the following fields: d.examples A list of examples. Each one is a list of attribute values. d.attrs A list of integers to index into an example, so example[attr] gives a value. Normally the same as range(len(d.examples[0])). - d.attrnames Optional list of mnemonic names for corresponding attrs. + d.attr_names Optional list of mnemonic names for corresponding attrs. d.target The attribute that a learning algorithm will try to predict. By default the final attribute. d.inputs The list of attrs without the target. d.values A list of lists: each sublist is the set of possible values for the corresponding attribute. If initially None, - it is computed from the known examples by self.setproblem. + it is computed from the known examples by self.set_problem. If not None, an erroneous value raises ValueError. - d.distance A function from a pair of examples to a nonnegative number. + d.distance A function from a pair of examples to a non-negative number. Should be symmetric, etc. Defaults to mean_boolean_error since that can handle any field types. d.name Name of the data set (for output display only). d.source URL or other source where the data came from. d.exclude A list of attribute indexes to exclude from d.inputs. Elements - of this list can either be integers (attrs) or attrnames. + of this list can either be integers (attrs) or attr_names. Normally, you call the constructor and you're done; then you just - access fields like d.examples and d.target and d.inputs.""" + access fields like d.examples and d.target and d.inputs. + """ - def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, - inputs=None, values=None, distance=mean_boolean_error, - name='', source='', exclude=()): - """Accepts any of DataSet's fields. Examples can also be a + def __init__(self, examples=None, attrs=None, attr_names=None, target=-1, inputs=None, + values=None, distance=mean_boolean_error, name='', source='', exclude=()): + """ + Accepts any of DataSet's fields. 
Examples can also be a string or file from which to parse examples using parse_csv. - Optional parameter: exclude, as documented in .setproblem(). + Optional parameter: exclude, as documented in .set_problem(). >>> DataSet(examples='1, 2, 3') """ @@ -57,7 +56,7 @@ def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, self.distance = distance self.got_values_flag = bool(values) - # Initialize .examples from string or list or data directory + # initialize .examples from string or list or data directory if isinstance(examples, str): self.examples = parse_csv(examples) elif examples is None: @@ -65,39 +64,40 @@ def __init__(self, examples=None, attrs=None, attrnames=None, target=-1, else: self.examples = examples - # Attrs are the indices of examples, unless otherwise stated. + # attrs are the indices of examples, unless otherwise stated. if self.examples is not None and attrs is None: attrs = list(range(len(self.examples[0]))) self.attrs = attrs - # Initialize .attrnames from string, list, or by default - if isinstance(attrnames, str): - self.attrnames = attrnames.split() + # initialize .attr_names from string, list, or by default + if isinstance(attr_names, str): + self.attr_names = attr_names.split() else: - self.attrnames = attrnames or attrs - self.setproblem(target, inputs=inputs, exclude=exclude) + self.attr_names = attr_names or attrs + self.set_problem(target, inputs=inputs, exclude=exclude) - def setproblem(self, target, inputs=None, exclude=()): - """Set (or change) the target and/or inputs. + def set_problem(self, target, inputs=None, exclude=()): + """ + Set (or change) the target and/or inputs. This way, one DataSet can be used multiple ways. inputs, if specified, is a list of attributes, or specify exclude as a list of attributes - to not use in inputs. Attributes can be -n .. n, or an attrname. 
- Also computes the list of possible values, if that wasn't done yet.""" - self.target = self.attrnum(target) - exclude = list(map(self.attrnum, exclude)) + to not use in inputs. Attributes can be -n .. n, or an attr_name. + Also computes the list of possible values, if that wasn't done yet. + """ + self.target = self.attr_num(target) + exclude = list(map(self.attr_num, exclude)) if inputs: - self.inputs = removeall(self.target, inputs) + self.inputs = remove_all(self.target, inputs) else: - self.inputs = [a for a in self.attrs - if a != self.target and a not in exclude] + self.inputs = [a for a in self.attrs if a != self.target and a not in exclude] if not self.values: self.update_values() self.check_me() def check_me(self): """Check that my fields make sense.""" - assert len(self.attrnames) == len(self.attrs) + assert len(self.attr_names) == len(self.attrs) assert self.target in self.attrs assert self.target not in self.inputs assert set(self.inputs).issubset(set(self.attrs)) @@ -116,12 +116,12 @@ def check_example(self, example): for a in self.attrs: if example[a] not in self.values[a]: raise ValueError('Bad value {} for attribute {} in {}' - .format(example[a], self.attrnames[a], example)) + .format(example[a], self.attr_names[a], example)) - def attrnum(self, attr): + def attr_num(self, attr): """Returns the number used for attr, which can be a name, or -n .. 
n-1.""" if isinstance(attr, str): - return self.attrnames.index(attr) + return self.attr_names.index(attr) elif attr < 0: return len(self.attrs) + attr else: @@ -132,13 +132,12 @@ def update_values(self): def sanitize(self, example): """Return a copy of example, with non-input attributes replaced by None.""" - return [attr_i if i in self.inputs else None - for i, attr_i in enumerate(example)] + return [attr_i if i in self.inputs else None for i, attr_i in enumerate(example)] def classes_to_numbers(self, classes=None): """Converts class names to numbers.""" if not classes: - # If classes were not given, extract them from values + # if classes were not given, extract them from values classes = sorted(self.values[self.target]) for item in self.examples: item[self.target] = classes.index(item[self.target]) @@ -154,17 +153,19 @@ def split_values_by_classes(self): target_names = self.values[self.target] for v in self.examples: - item = [a for a in v if a not in target_names] # Remove target from item - buckets[v[self.target]].append(item) # Add item to bucket of its class + item = [a for a in v if a not in target_names] # remove target from item + buckets[v[self.target]].append(item) # add item to bucket of its class return buckets def find_means_and_deviations(self): - """Finds the means and standard deviations of self.dataset. - means : A dictionary for each class/target. Holds a list of the means + """ + Finds the means and standard deviations of self.dataset. + means : a dictionary for each class/target. Holds a list of the means of the features for the class. - deviations: A dictionary for each class/target. Holds a list of the sample - standard deviations of the features for the class.""" + deviations: a dictionary for each class/target. Holds a list of the sample + standard deviations of the features for the class. 
+ """ target_names = self.values[self.target] feature_numbers = len(self.inputs) @@ -174,13 +175,13 @@ def find_means_and_deviations(self): deviations = defaultdict(lambda: [0] * feature_numbers) for t in target_names: - # Find all the item feature values for item in class t - features = [[] for i in range(feature_numbers)] + # find all the item feature values for item in class t + features = [[] for _ in range(feature_numbers)] for item in item_buckets[t]: for i in range(feature_numbers): features[i].append(item[i]) - # Calculate means and deviations fo the class + # calculate means and deviations fo the class for i in range(feature_numbers): means[t][i] = mean(features[i]) deviations[t][i] = stdev(features[i]) @@ -188,44 +189,177 @@ def find_means_and_deviations(self): return means, deviations def __repr__(self): - return ''.format( - self.name, len(self.examples), len(self.attrs)) - - -# ______________________________________________________________________________ + return ''.format(self.name, len(self.examples), len(self.attrs)) def parse_csv(input, delim=','): - r"""Input is a string consisting of lines, each line has comma-delimited + r""" + Input is a string consisting of lines, each line has comma-delimited fields. Convert this into a list of lists. Blank lines are skipped. Fields that look like numbers are converted to numbers. The delim defaults to ',' but '\t' and None are also reasonable values. >>> parse_csv('1, 2, 3 \n 0, 2, na') - [[1, 2, 3], [0, 2, 'na']]""" + [[1, 2, 3], [0, 2, 'na']] + """ lines = [line for line in input.splitlines() if line.strip()] return [list(map(num_or_str, line.split(delim))) for line in lines] -# ______________________________________________________________________________ -# 18.3 Learning decision trees +def err_ratio(predict, dataset, examples=None, verbose=0): + """ + Return the proportion of the examples that are NOT correctly predicted. 
+ verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct + """ + examples = examples or dataset.examples + if len(examples) == 0: + return 0.0 + right = 0 + for example in examples: + desired = example[dataset.target] + output = predict(dataset.sanitize(example)) + if output == desired: + right += 1 + if verbose >= 2: + print(' OK: got {} for {}'.format(desired, example)) + elif verbose: + print('WRONG: got {}, expected {} for {}'.format(output, desired, example)) + return 1 - (right / len(examples)) + + +def grade_learner(predict, tests): + """ + Grades the given learner based on how many tests it passes. + tests is a list with each element in the form: (values, output). + """ + return mean(int(predict(X) == y) for X, y in tests) + + +def train_test_split(dataset, start=None, end=None, test_split=None): + """ + If you are giving 'start' and 'end' as parameters, + then it will return the testing set from index 'start' to 'end' + and the rest for training. + If you give 'test_split' as a parameter then it will return + test_split * 100% as the testing set and the rest as + training set. + """ + examples = dataset.examples + if test_split is None: + train = examples[:start] + examples[end:] + val = examples[start:end] + else: + total_size = len(examples) + val_size = int(total_size * test_split) + train_size = total_size - val_size + train = examples[:train_size] + val = examples[train_size:total_size] + + return train, val + + +def model_selection(learner, dataset, k=10, trials=1): + """ + [Figure 18.8] + Return the optimal value of size having minimum error on validation set. 
+    err: a validation error array, indexed by size
+    """
+    errs = []
+    size = 1
+    while True:
+        err = cross_validation(learner, dataset, size, k, trials)
+        # check for convergence provided errs is not empty
+        if errs and isclose(errs[-1], err, rel_tol=1e-6):
+            best_size = 0
+            min_val = math.inf
+            i = 0
+            while i < size:
+                if errs[i] < min_val:
+                    min_val = errs[i]
+                    best_size = i
+                i += 1
+            return learner(dataset, best_size)
+        errs.append(err)
+        size += 1
+
+
+def cross_validation(learner, dataset, size=None, k=10, trials=1):
+    """
+    Do k-fold cross_validate and return their mean.
+    That is, keep out 1/k of the examples for testing on each of k runs.
+    Shuffle the examples first; if trials>1, average over several shuffles.
+    Returns Training error
+    """
+    k = k or len(dataset.examples)
+    if trials > 1:
+        trial_errs = 0
+        for t in range(trials):
+            errs = cross_validation(learner, dataset, size, k, trials=1)
+            trial_errs += errs
+        return trial_errs / trials
+    else:
+        fold_errs = 0
+        n = len(dataset.examples)
+        examples = dataset.examples
+        random.shuffle(dataset.examples)
+        for fold in range(k):
+            train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k))
+            dataset.examples = train_data
+            h = learner(dataset, size)
+            fold_errs += err_ratio(h, dataset, train_data)
+            # reverting back to original once test is completed
+            dataset.examples = examples
+        return fold_errs / k
+
+
+def leave_one_out(learner, dataset, size=None):
+    """Leave one out cross-validation over the dataset."""
+    return cross_validation(learner, dataset, size, len(dataset.examples))
+
+
+# TODO learning_curve needs to be fixed
+def learning_curve(learner, dataset, trials=10, sizes=None):
+    if sizes is None:
+        sizes = list(range(2, len(dataset.examples) - 10, 2))
+
+    def score(learner, size):
+        random.shuffle(dataset.examples)
+        return train_test_split(learner, dataset, 0, size)
+
+    return [(size, mean([score(learner, size) for _ in range(trials)])) for size in sizes]
+
+
+def PluralityLearner(dataset): + """ + A very dumb algorithm: always pick the result that was most popular + in the training data. Makes a baseline for comparison. + """ + most_popular = mode([e[dataset.target] for e in dataset.examples]) + + def predict(example): + """Always return same result: the most popular from the training set.""" + return most_popular + + return predict class DecisionFork: - """A fork of a decision tree holds an attribute to test, and a dict - of branches, one for each of the attribute's values.""" + """ + A fork of a decision tree holds an attribute to test, and a dict + of branches, one for each of the attribute's values. + """ - def __init__(self, attr, attrname=None, default_child=None, branches=None): + def __init__(self, attr, attr_name=None, default_child=None, branches=None): """Initialize by saying what attribute this node tests.""" self.attr = attr - self.attrname = attrname or attr + self.attr_name = attr_name or attr self.default_child = default_child self.branches = branches or {} def __call__(self, example): """Given an example, classify it using the attribute and the branches.""" - attrvalue = example[self.attr] - if attrvalue in self.branches: - return self.branches[attrvalue](example) + attr_val = example[self.attr] + if attr_val in self.branches: + return self.branches[attr_val](example) else: # return default class when attribute is unknown return self.default_child(example) @@ -235,16 +369,14 @@ def add(self, val, subtree): self.branches[val] = subtree def display(self, indent=0): - name = self.attrname + name = self.attr_name print('Test', name) for (val, subtree) in self.branches.items(): print(' ' * 4 * indent, name, '=', val, '==>', end=' ') subtree.display(indent + 1) - print() # newline def __repr__(self): - return ('DecisionFork({0!r}, {1!r}, {2!r})' - .format(self.attr, self.attrname, self.branches)) + return 'DecisionFork({0!r}, {1!r}, {2!r})'.format(self.attr, self.attr_name, self.branches) class DecisionLeaf: 
@@ -256,37 +388,37 @@ def __init__(self, result): def __call__(self, example): return self.result - def display(self, indent=0): + def display(self): print('RESULT =', self.result) def __repr__(self): return repr(self.result) -# decision tree learning in Figure 18.5 - - def DecisionTreeLearner(dataset): + """[Figure 18.5]""" + target, values = dataset.target, dataset.values def decision_tree_learning(examples, attrs, parent_examples=()): if len(examples) == 0: return plurality_value(parent_examples) - elif all_same_class(examples): + if all_same_class(examples): return DecisionLeaf(examples[0][target]) - elif len(attrs) == 0: + if len(attrs) == 0: return plurality_value(examples) - else: - A = choose_attribute(attrs, examples) - tree = DecisionFork(A, dataset.attrnames[A], plurality_value(examples)) - for (v_k, exs) in split_by(A, examples): - subtree = decision_tree_learning(exs, removeall(A, attrs), examples) - tree.add(v_k, subtree) - return tree + A = choose_attribute(attrs, examples) + tree = DecisionFork(A, dataset.attr_names[A], plurality_value(examples)) + for (v_k, exs) in split_by(A, examples): + subtree = decision_tree_learning(exs, remove_all(A, attrs), examples) + tree.add(v_k, subtree) + return tree def plurality_value(examples): - """Return the most popular target value for this set of examples. - (If target is binary, this is the majority; otherwise plurality.)""" + """ + Return the most popular target value for this set of examples. + (If target is binary, this is the majority; otherwise plurality). 
+ """ popular = argmax_random_tie(values[target], key=lambda v: count(target, v, examples)) return DecisionLeaf(popular) @@ -307,190 +439,31 @@ def information_gain(attr, examples): """Return the expected reduction in entropy from splitting by attr.""" def I(examples): - return information_content([count(target, v, examples) - for v in values[target]]) + return information_content([count(target, v, examples) for v in values[target]]) N = len(examples) - remainder = sum((len(examples_i) / N) * I(examples_i) - for (v, examples_i) in split_by(attr, examples)) + remainder = sum((len(examples_i) / N) * I(examples_i) for (v, examples_i) in split_by(attr, examples)) return I(examples) - remainder def split_by(attr, examples): """Return a list of (val, examples) pairs for each val of attr.""" - return [(v, [e for e in examples if e[attr] == v]) - for v in values[attr]] + return [(v, [e for e in examples if e[attr] == v]) for v in values[attr]] return decision_tree_learning(dataset.examples, dataset.inputs) def information_content(values): """Number of bits to represent the probability distribution in values.""" - probabilities = normalize(removeall(0, values)) + probabilities = normalize(remove_all(0, values)) return sum(-p * math.log2(p) for p in probabilities) -# ______________________________________________________________________________ -# 18.4 Model selection and optimization - - -def model_selection(learner, dataset, k=10, trials=1): - """[Fig 18.8] - Return the optimal value of size having minimum error - on validation set. 
- err_train: A training error array, indexed by size - err_val: A validation error array, indexed by size +def DecisionListLearner(dataset): """ - errs = [] - size = 1 - - while True: - err = cross_validation(learner, size, dataset, k, trials) - # Check for convergence provided err_val is not empty - if err and not isclose(err[-1], err, rel_tol=1e-6): - best_size = 0 - min_val = math.inf - - i = 0 - while i < size: - if errs[i] < min_val: - min_val = errs[i] - best_size = i - i += 1 - return learner(dataset, best_size) - errs.append(err) - size += 1 - - -def cross_validation(learner, size, dataset, k=10, trials=1): - """Do k-fold cross_validate and return their mean. - That is, keep out 1/k of the examples for testing on each of k runs. - Shuffle the examples first; if trials>1, average over several shuffles. - Returns Training error, Validation error""" - k = k or len(dataset.examples) - if trials > 1: - trial_errs = 0 - for t in range(trials): - errs = cross_validation(learner, size, dataset, k=10, trials=1) - trial_errs += errs - return trial_errs / trials - else: - fold_errs = 0 - n = len(dataset.examples) - examples = dataset.examples - random.shuffle(dataset.examples) - for fold in range(k): - train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k)) - dataset.examples = train_data - h = learner(dataset, size) - fold_errs += err_ratio(h, dataset, train_data) - - # Reverting back to original once test is completed - dataset.examples = examples - return fold_errs / k - - -def cross_validation_nosize(learner, dataset, k=10, trials=1): - """Do k-fold cross_validate and return their mean. - That is, keep out 1/k of the examples for testing on each of k runs. - Shuffle the examples first; if trials>1, average over several shuffles. 
- Returns Training error, Validation error""" - k = k or len(dataset.examples) - if trials > 1: - trial_errs = 0 - for t in range(trials): - errs = cross_validation(learner, dataset, k=10, trials=1) - trial_errs += errs - return trial_errs / trials - else: - fold_errs = 0 - n = len(dataset.examples) - examples = dataset.examples - random.shuffle(dataset.examples) - for fold in range(k): - train_data, val_data = train_test_split(dataset, fold * (n // k), (fold + 1) * (n // k)) - dataset.examples = train_data - h = learner(dataset) - fold_errs += err_ratio(h, dataset, train_data) - - # Reverting back to original once test is completed - dataset.examples = examples - return fold_errs / k - - -def err_ratio(predict, dataset, examples=None, verbose=0): - """Return the proportion of the examples that are NOT correctly predicted. - verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct""" - examples = examples or dataset.examples - if len(examples) == 0: - return 0.0 - right = 0 - for example in examples: - desired = example[dataset.target] - output = predict(dataset.sanitize(example)) - if output == desired: - right += 1 - if verbose >= 2: - print(' OK: got {} for {}'.format(desired, example)) - elif verbose: - print('WRONG: got {}, expected {} for {}'.format( - output, desired, example)) - return 1 - (right / len(examples)) - - -def train_test_split(dataset, start=None, end=None, test_split=None): - """If you are giving 'start' and 'end' as parameters, - then it will return the testing set from index 'start' to 'end' - and the rest for training. - If you give 'test_split' as a parameter then it will return - test_split * 100% as the testing set and the rest as - training set. + [Figure 18.11] + A decision list implemented as a list of (test, value) pairs. 
""" - examples = dataset.examples - if test_split == None: - train = examples[:start] + examples[end:] - val = examples[start:end] - else: - total_size = len(examples) - val_size = int(total_size * test_split) - train_size = total_size - val_size - train = examples[:train_size] - val = examples[train_size:total_size] - - return train, val - - -def grade_learner(predict, tests): - """Grades the given learner based on how many tests it passes. - tests is a list with each element in the form: (values, output).""" - return mean(int(predict(X) == y) for X, y in tests) - - -def leave_one_out(learner, dataset, size=None): - """Leave one out cross-validation over the dataset.""" - return cross_validation(learner, size, dataset, k=len(dataset.examples)) - -# TODO learning_curve needs to fixed -def learning_curve(learner, dataset, trials=10, sizes=None): - if sizes is None: - sizes = list(range(2, len(dataset.examples) - 10, 2)) - - def score(learner, size): - random.shuffle(dataset.examples) - return train_test_split(learner, dataset, 0, size) - - return [(size, mean([score(learner, size) for t in range(trials)])) - for size in sizes] - - -# ______________________________________________________________________________ -# 18.5 The theory Of learning - - -def DecisionListLearner(dataset): - """A decision list is implemented as a list of (test, value) pairs.[Figure 18.11]""" - - # TODO: where are the tests from? def decision_list_learning(examples): if not examples: return [(True, False)] @@ -500,13 +473,14 @@ def decision_list_learning(examples): return [(t, o)] + decision_list_learning(examples - examples_t) def find_examples(examples): - """Find a set of examples that all have the same outcome under - some test. Return a tuple of the test, outcome, and examples.""" + """ + Find a set of examples that all have the same outcome under + some test. Return a tuple of the test, outcome, and examples. 
+ """ raise NotImplementedError def passes(example, test): """Does the example pass the test?""" - return test.test(example) raise NotImplementedError def predict(example): @@ -520,36 +494,44 @@ def predict(example): return predict -# ______________________________________________________________________________ -# 18.6 Linear regression and classification +def NearestNeighborLearner(dataset, k=1): + """k-NearestNeighbor: the k nearest neighbors vote.""" + + def predict(example): + """Find the k closest items, and have them vote for the best.""" + best = heapq.nsmallest(k, ((dataset.distance(e, example), e) for e in dataset.examples)) + return mode(e[dataset.target] for (d, e) in best) + + return predict def LinearLearner(dataset, learning_rate=0.01, epochs=100): - """Define with learner = LinearLearner(data); infer with learner(x).""" + """ + [Section 18.6.4] + Linear classifier with hard threshold. + """ idx_i = dataset.inputs - idx_t = dataset.target # As of now, dataset.target gives only one index. + idx_t = dataset.target examples = dataset.examples num_examples = len(examples) # X transpose X_col = [dataset.values[i] for i in idx_i] # vertical columns of X - # Add dummy + # add dummy ones = [1 for _ in range(len(examples))] X_col = [ones] + X_col - # Initialize random weights + # initialize random weights num_weights = len(idx_i) + 1 w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) for epoch in range(epochs): err = [] - # Pass over all examples + # pass over all examples for example in examples: x = [1] + example y = dotproduct(w, x) - # if threshold: - # y = threshold(y) t = example[idx_t] err.append(t - y) @@ -565,7 +547,10 @@ def predict(example): def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100): - """Define logistic regression classifier in 18.6.5""" + """ + [Section 18.6.5] + Linear classifier with logistic regression. 
+ """ idx_i = dataset.inputs idx_t = dataset.target examples = dataset.examples @@ -574,59 +559,37 @@ def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100): # X transpose X_col = [dataset.values[i] for i in idx_i] # vertical columns of X - # Add dummy + # add dummy ones = [1 for _ in range(len(examples))] X_col = [ones] + X_col - # Initialize random weights + # initialize random weights num_weights = len(idx_i) + 1 w = random_weights(min_value=-0.5, max_value=0.5, num_weights=num_weights) for epoch in range(epochs): err = [] h = [] - # Pass over all examples + # pass over all examples for example in examples: x = [1] + example - y = 1 / (1 + math.exp(-dotproduct(w, x))) - h.append(y * (1 - y)) + y = sigmoid(dotproduct(w, x)) + h.append(sigmoid_derivative(y)) t = example[idx_t] err.append(t - y) # update weights for i in range(len(w)): buffer = [x * y for x, y in zip(err, h)] - # w[i] = w[i] + learning_rate * (dotproduct(err, X_col[i]) / num_examples) w[i] = w[i] + learning_rate * (dotproduct(buffer, X_col[i]) / num_examples) def predict(example): x = [1] + example - return 1 / (1 + math.exp(-dotproduct(w, x))) - - return predict - - -# ______________________________________________________________________________ -# 18.7 Nonparametric models - - -def NearestNeighborLearner(dataset, k=1): - """k-NearestNeighbor: the k nearest neighbors vote.""" - - def predict(example): - """Find the k closest items, and have them vote for the best.""" - example.pop(dataset.target) - best = heapq.nsmallest(k, ((dataset.distance(e, example), e) - for e in dataset.examples)) - return mode(e[dataset.target] for (d, e) in best) + return sigmoid(dotproduct(w, x)) return predict -# ______________________________________________________________________________ -# 18.8 Ensemble learning - - def EnsembleLearner(learners): """Given a list of learning algorithms, have them vote.""" @@ -641,6 +604,49 @@ def predict(example): return train +def ada_boost(dataset, L, K): + """[Figure 
18.34]""" + + examples, target = dataset.examples, dataset.target + N = len(examples) + epsilon = 1 / (2 * N) + w = [1 / N] * N + h, z = [], [] + for k in range(K): + h_k = L(dataset, w) + h.append(h_k) + error = sum(weight for example, weight in zip(examples, w) if example[target] != h_k(example)) + # avoid divide-by-0 from either 0% or 100% error rates + error = clip(error, epsilon, 1 - epsilon) + for j, example in enumerate(examples): + if example[target] == h_k(example): + w[j] *= error / (1 - error) + w = normalize(w) + z.append(math.log((1 - error) / error)) + return weighted_majority(h, z) + + +def weighted_majority(predictors, weights): + """Return a predictor that takes a weighted vote.""" + + def predict(example): + return weighted_mode((predictor(example) for predictor in predictors), weights) + + return predict + + +def weighted_mode(values, weights): + """ + Return the value with the greatest total weight. + >>> weighted_mode('abbaa', [1, 2, 3, 1, 2]) + 'b' + """ + totals = defaultdict(int) + for v, w in zip(values, weights): + totals[v] += w + return max(totals, key=totals.__getitem__) + + def RandomForest(dataset, n=5): """An ensemble of Decision Trees trained using bagging and feature bagging.""" @@ -658,70 +664,19 @@ def predict(example): print([predictor(example) for predictor in predictors]) return mode(predictor(example) for predictor in predictors) - predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), - attrs=dataset.attrs, - attrnames=dataset.attrnames, - target=dataset.target, + predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), attrs=dataset.attrs, + attr_names=dataset.attr_names, target=dataset.target, inputs=feature_bagging(dataset))) for _ in range(n)] return predict -def AdaBoost(L, K): - """[Figure 18.34]""" - - def train(dataset): - examples, target = dataset.examples, dataset.target - N = len(examples) - epsilon = 1 / (2 * N) - w = [1 / N] * N - h, z = [], [] - for k in range(K): - h_k = 
L(dataset, w) - h.append(h_k) - error = sum(weight for example, weight in zip(examples, w) - if example[target] != h_k(example)) - - # Avoid divide-by-0 from either 0% or 100% error rates: - error = clip(error, epsilon, 1 - epsilon) - for j, example in enumerate(examples): - if example[target] == h_k(example): - w[j] *= error / (1 - error) - w = normalize(w) - z.append(math.log((1 - error) / error)) - return WeightedMajority(h, z) - - return train - - -def WeightedMajority(predictors, weights): - """Return a predictor that takes a weighted vote.""" - - def predict(example): - return weighted_mode((predictor(example) for predictor in predictors), - weights) - - return predict - - -def weighted_mode(values, weights): - """Return the value with the greatest total weight. - >>> weighted_mode('abbaa', [1, 2, 3, 1, 2]) - 'b' - """ - totals = defaultdict(int) - for v, w in zip(values, weights): - totals[v] += w - return max(totals, key=totals.__getitem__) - - -# _____________________________________________________________________________ -# Adapting an unweighted learner for AdaBoost - - def WeightedLearner(unweighted_learner): - """Given a learner that takes just an unweighted dataset, return - one that takes also a weight for each example. [p. 749 footnote 14]""" + """ + [Page 749 footnote 14] + Given a learner that takes just an unweighted dataset, return + one that takes also a weight for each example. + """ def train(dataset, weights): return unweighted_learner(replicated_dataset(dataset, weights)) @@ -738,7 +693,8 @@ def replicated_dataset(dataset, weights, n=None): def weighted_replicate(seq, weights, n): - """Return n selections from seq, with the count of each element of + """ + Return n selections from seq, with the count of each element of seq proportional to the corresponding weight (filling in fractions randomly). 
>>> weighted_replicate('ABC', [1, 2, 1], 4) @@ -752,48 +708,39 @@ def weighted_replicate(seq, weights, n): weighted_sample_with_replacement(n - sum(wholes), seq, fractions)) -def flatten(seqs): return sum(seqs, []) - - -# _____________________________________________________________________________ -# Functions for testing learners on examples -# The rest of this file gives datasets for machine learning problems. +def flatten(seqs): + return sum(seqs, []) -orings = DataSet(name='orings', target='Distressed', - attrnames="Rings Distressed Temp Pressure Flightnum") +orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum') zoo = DataSet(name='zoo', target='type', exclude=['name'], - attrnames="name hair feathers eggs milk airborne aquatic " + - "predator toothed backbone breathes venomous fins legs tail " + - "domestic catsize type") - -iris = DataSet(name="iris", target="class", - attrnames="sepal-len sepal-width petal-len petal-width class") - + attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone ' + 'breathes venomous fins legs tail domestic catsize type') -# ______________________________________________________________________________ -# The Restaurant example from [Figure 18.2] +iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class') def RestaurantDataSet(examples=None): - """Build a DataSet of Restaurant waiting examples. [Figure 18.3]""" + """ + [Figure 18.3] + Build a DataSet of Restaurant waiting examples. 
+ """ return DataSet(name='restaurant', target='Wait', examples=examples, - attrnames='Alternate Bar Fri/Sat Hungry Patrons Price ' + - 'Raining Reservation Type WaitEstimate Wait') + attr_names='Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait') restaurant = RestaurantDataSet() -def T(attrname, branches): - branches = {value: (child if isinstance(child, DecisionFork) - else DecisionLeaf(child)) +def T(attr_name, branches): + branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child)) for value, child in branches.items()} - return DecisionFork(restaurant.attrnum(attrname), attrname, print, branches) + return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches) -""" [Figure 18.2] +""" +[Figure 18.2] A decision tree for deciding whether to wait for a table at a hotel. """ @@ -806,8 +753,7 @@ def T(attrname, branches): {'Yes': 'Yes', 'No': T('Bar', {'No': 'No', 'Yes': 'Yes'})}), - 'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})} - ), + 'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}), '10-30': T('Hungry', {'No': 'Yes', 'Yes': T('Alternate', @@ -825,30 +771,30 @@ def gen(): example[restaurant.target] = waiting_decision_tree(example) return example - return RestaurantDataSet([gen() for i in range(n)]) - - -# ______________________________________________________________________________ -# Artificial, generated datasets. + return RestaurantDataSet([gen() for _ in range(n)]) def Majority(k, n): - """Return a DataSet with n k-bit examples of the majority problem: - k random bits followed by a 1 if more than half the bits are 1, else 0.""" + """ + Return a DataSet with n k-bit examples of the majority problem: + k random bits followed by a 1 if more than half the bits are 1, else 0. 
+ """ examples = [] for i in range(n): - bits = [random.choice([0, 1]) for i in range(k)] + bits = [random.choice([0, 1]) for _ in range(k)] bits.append(int(sum(bits) > k / 2)) examples.append(bits) - return DataSet(name="majority", examples=examples) + return DataSet(name='majority', examples=examples) -def Parity(k, n, name="parity"): - """Return a DataSet with n k-bit examples of the parity problem: - k random bits followed by a 1 if an odd number of bits are 1, else 0.""" +def Parity(k, n, name='parity'): + """ + Return a DataSet with n k-bit examples of the parity problem: + k random bits followed by a 1 if an odd number of bits are 1, else 0. + """ examples = [] for i in range(n): - bits = [random.choice([0, 1]) for i in range(k)] + bits = [random.choice([0, 1]) for _ in range(k)] bits.append(sum(bits) % 2) examples.append(bits) return DataSet(name=name, examples=examples) @@ -856,27 +802,29 @@ def Parity(k, n, name="parity"): def Xor(n): """Return a DataSet with n examples of 2-input xor.""" - return Parity(2, n, name="xor") + return Parity(2, n, name='xor') def ContinuousXor(n): """2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints.""" examples = [] for i in range(n): - x, y = [random.uniform(0.0, 2.0) for i in '12'] - examples.append([x, y, int(x) != int(y)]) - return DataSet(name="continuous xor", examples=examples) + x, y = [random.uniform(0.0, 2.0) for _ in '12'] + examples.append([x, y, x != y]) + return DataSet(name='continuous xor', examples=examples) def compare(algorithms=None, datasets=None, k=10, trials=1): - """Compare various learners on various datasets using cross-validation. - Print results as a table.""" - algorithms = algorithms or [NearestNeighborLearner, DecisionTreeLearner] # default list of algorithms + """ + Compare various learners on various datasets using cross-validation. + Print results as a table. 
+ """ + # default list of algorithms + algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner] - datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20), # default list - Majority(7, 100), Parity(7, 100), Xor(100)] # of datasets + # default list of datasets + datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20), + Majority(7, 100), Parity(7, 100), Xor(100)] - print_table([[a.__name__.replace('Learner', '')] + - [cross_validation_nosize(a, d, k, trials) for d in datasets] - for a in algorithms], - header=[''] + [d.name[0:7] for d in datasets], numfmt='{0:.2f}') + print_table([[a.__name__.replace('Learner', '')] + [cross_validation(a, d, k=k, trials=trials) for d in datasets] + for a in algorithms], header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f') diff --git a/learning_apps.ipynb b/learning_apps.ipynb index 6d5a27a45..dd45b11b5 100644 --- a/learning_apps.ipynb +++ b/learning_apps.ipynb @@ -16,6 +16,7 @@ "outputs": [], "source": [ "from learning import *\n", + "from probabilistic_learning import *\n", "from notebook import *" ] }, @@ -971,8 +972,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/logic.py b/logic.py index 60da6294d..7f4d259dd 100644 --- a/logic.py +++ b/logic.py @@ -40,10 +40,8 @@ from agents import Agent, Glitter, Bump, Stench, Breeze, Scream from csp import parse_neighbors, UniversalDict from search import astar_search, PlanRoute -from utils import ( - removeall, unique, first, argmax, probability, - isnumber, issequence, Expr, expr, subexpressions, - extend) +from utils import (remove_all, unique, first, argmax, probability, isnumber, + issequence, Expr, expr, subexpressions, extend) # 
______________________________________________________________________________ @@ -508,7 +506,7 @@ def pl_resolve(ci, cj): for di in disjuncts(ci): for dj in disjuncts(cj): if di == ~dj or ~di == dj: - clauses.append(associate('|', unique(removeall(di, disjuncts(ci)) + removeall(dj, disjuncts(cj))))) + clauses.append(associate('|', unique(remove_all(di, disjuncts(ci)) + remove_all(dj, disjuncts(cj))))) return clauses @@ -714,13 +712,13 @@ def dpll(clauses, symbols, model, branching_heuristic=no_branching_heuristic): return model P, value = find_pure_symbol(symbols, unknown_clauses) if P: - return dpll(clauses, removeall(P, symbols), extend(model, P, value), branching_heuristic) + return dpll(clauses, remove_all(P, symbols), extend(model, P, value), branching_heuristic) P, value = find_unit_clause(clauses, model) if P: - return dpll(clauses, removeall(P, symbols), extend(model, P, value), branching_heuristic) + return dpll(clauses, remove_all(P, symbols), extend(model, P, value), branching_heuristic) P, value = branching_heuristic(symbols, unknown_clauses) - return (dpll(clauses, removeall(P, symbols), extend(model, P, value), branching_heuristic) or - dpll(clauses, removeall(P, symbols), extend(model, P, not value), branching_heuristic)) + return (dpll(clauses, remove_all(P, symbols), extend(model, P, value), branching_heuristic) or + dpll(clauses, remove_all(P, symbols), extend(model, P, not value), branching_heuristic)) def find_pure_symbol(symbols, clauses): @@ -950,8 +948,8 @@ def pl_binary_resolution(ci, cj): for di in disjuncts(ci): for dj in disjuncts(cj): if di == ~dj or ~di == dj: - return pl_binary_resolution(associate('|', removeall(di, disjuncts(ci))), - associate('|', removeall(dj, disjuncts(cj)))) + return pl_binary_resolution(associate('|', remove_all(di, disjuncts(ci))), + associate('|', remove_all(dj, disjuncts(cj)))) return associate('|', unique(disjuncts(ci) + disjuncts(cj))) diff --git a/probabilistic_learning.py b/probabilistic_learning.py new 
file mode 100644 index 000000000..4b78ef2d9 --- /dev/null +++ b/probabilistic_learning.py @@ -0,0 +1,154 @@ +"""Learning probabilistic models. (Chapters 20)""" + +import heapq + +from utils import weighted_sampler, argmax, product, gaussian + + +class CountingProbDist: + """ + A probability distribution formed by observing and counting examples. + If p is an instance of this class and o is an observed value, then + there are 3 main operations: + p.add(o) increments the count for observation o by 1. + p.sample() returns a random element from the distribution. + p[o] returns the probability for o (as in a regular ProbDist). + """ + + def __init__(self, observations=None, default=0): + """ + Create a distribution, and optionally add in some observations. + By default this is an unsmoothed distribution, but saying default=1, + for example, gives you add-one smoothing. + """ + if observations is None: + observations = [] + self.dictionary = {} + self.n_obs = 0 + self.default = default + self.sampler = None + + for o in observations: + self.add(o) + + def add(self, o): + """Add an observation o to the distribution.""" + self.smooth_for(o) + self.dictionary[o] += 1 + self.n_obs += 1 + self.sampler = None + + def smooth_for(self, o): + """ + Include o among the possible observations, whether or not + it's been observed yet. + """ + if o not in self.dictionary: + self.dictionary[o] = self.default + self.n_obs += self.default + self.sampler = None + + def __getitem__(self, item): + """Return an estimate of the probability of item.""" + self.smooth_for(item) + return self.dictionary[item] / self.n_obs + + # (top() and sample() are not used in this module, but elsewhere.) 
+ + def top(self, n): + """Return (count, obs) tuples for the n most frequent observations.""" + return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()]) + + def sample(self): + """Return a random sample from the distribution.""" + if self.sampler is None: + self.sampler = weighted_sampler(list(self.dictionary.keys()), list(self.dictionary.values())) + return self.sampler() + + +def NaiveBayesLearner(dataset, continuous=True, simple=False): + if simple: + return NaiveBayesSimple(dataset) + if continuous: + return NaiveBayesContinuous(dataset) + else: + return NaiveBayesDiscrete(dataset) + + +def NaiveBayesSimple(distribution): + """ + A simple naive bayes classifier that takes as input a dictionary of + CountingProbDist objects and classifies items according to these distributions. + The input dictionary is in the following form: + (ClassName, ClassProb): CountingProbDist + """ + target_dist = {c_name: prob for c_name, prob in distribution.keys()} + attr_dists = {c_name: count_prob for (c_name, _), count_prob in distribution.items()} + + def predict(example): + """Predict the target value for example. Calculate probabilities for each + class and pick the max.""" + + def class_probability(target_val): + attr_dist = attr_dists[target_val] + return target_dist[target_val] * product(attr_dist[a] for a in example) + + return argmax(target_dist.keys(), key=class_probability) + + return predict + + +def NaiveBayesDiscrete(dataset): + """ + Just count how many times each value of each input attribute + occurs, conditional on the target value. Count the different + target values too. 
+ """ + + target_vals = dataset.values[dataset.target] + target_dist = CountingProbDist(target_vals) + attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr]) for gv in target_vals for attr in dataset.inputs} + for example in dataset.examples: + target_val = example[dataset.target] + target_dist.add(target_val) + for attr in dataset.inputs: + attr_dists[target_val, attr].add(example[attr]) + + def predict(example): + """ + Predict the target value for example. Consider each possible value, + and pick the most likely by looking at each attribute independently. + """ + + def class_probability(target_val): + return (target_dist[target_val] * product(attr_dists[target_val, attr][example[attr]] + for attr in dataset.inputs)) + + return argmax(target_vals, key=class_probability) + + return predict + + +def NaiveBayesContinuous(dataset): + """ + Count how many times each target value occurs. + Also, find the means and deviations of input attribute values for each target value. + """ + means, deviations = dataset.find_means_and_deviations() + + target_vals = dataset.values[dataset.target] + target_dist = CountingProbDist(target_vals) + + def predict(example): + """Predict the target value for example. 
Consider each possible value, + and pick the most likely by looking at each attribute independently.""" + + def class_probability(target_val): + prob = target_dist[target_val] + for attr in dataset.inputs: + prob *= gaussian(means[target_val][attr], deviations[target_val][attr], example[attr]) + return prob + + return argmax(target_vals, key=class_probability) + + return predict diff --git a/reinforcement_learning.ipynb b/reinforcement_learning.ipynb index a8f6adc2c..ee3b6a5eb 100644 --- a/reinforcement_learning.ipynb +++ b/reinforcement_learning.ipynb @@ -17,7 +17,7 @@ }, "outputs": [], "source": [ - "from rl import *" + "from reinforcement_learning import *" ] }, { @@ -628,8 +628,17 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } } }, "nbformat": 4, "nbformat_minor": 1 -} +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ce8246bfa..5a6603dd8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pytest sortedcontainers -networkx==1.11 +networkx jupyter pandas matplotlib diff --git a/tests/test_agents.py b/tests/test_agents.py index 64e8dc209..3b3182389 100644 --- a/tests/test_agents.py +++ b/tests/test_agents.py @@ -4,11 +4,10 @@ from agents import Agent from agents import Direction -from agents import ReflexVacuumAgent, ModelBasedVacuumAgent, TrivialVacuumEnvironment, compare_agents, \ - RandomVacuumAgent, TableDrivenVacuumAgent, TableDrivenAgentProgram, RandomAgentProgram, \ - SimpleReflexAgentProgram, ModelBasedReflexAgentProgram -from agents import Wall, Gold, Explorer, Thing, Bump, Glitter, WumpusEnvironment, Pit, \ - VacuumEnvironment, Dirt +from agents import (ReflexVacuumAgent, ModelBasedVacuumAgent, TrivialVacuumEnvironment, compare_agents, + RandomVacuumAgent, TableDrivenVacuumAgent, TableDrivenAgentProgram, RandomAgentProgram, + 
SimpleReflexAgentProgram, ModelBasedReflexAgentProgram, Wall, Gold, Explorer, Thing, Bump, Glitter, + WumpusEnvironment, Pit, VacuumEnvironment, Dirt) random.seed("aima-python") @@ -61,7 +60,7 @@ def test_add(): def test_RandomAgentProgram(): - # create a list of all the actions a vacuum cleaner can perform + # create a list of all the actions a Vacuum cleaner can perform list = ['Right', 'Left', 'Suck', 'NoOp'] # create a program and then an object of the RandomAgentProgram program = RandomAgentProgram(list) @@ -102,8 +101,7 @@ def test_TableDrivenAgent(): ((loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck', ((loc_B, 'Dirty'), (loc_B, 'Clean')): 'Left', ((loc_A, 'Dirty'), (loc_A, 'Clean'), (loc_B, 'Dirty')): 'Suck', - ((loc_B, 'Dirty'), (loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck' - } + ((loc_B, 'Dirty'), (loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck'} # create an program and then an object of the TableDrivenAgent program = TableDrivenAgentProgram(table) @@ -185,7 +183,7 @@ def matches(self, state): loc_A = (0, 0) loc_B = (1, 0) - # create rules for a two-state vacuum environment + # create rules for a two-state Vacuum Environment rules = [Rule((loc_A, "Dirty"), "Suck"), Rule((loc_A, "Clean"), "Right"), Rule((loc_B, "Dirty"), "Suck"), Rule((loc_B, "Clean"), "Left")] @@ -236,8 +234,8 @@ def test_compare_agents(): agents = [ModelBasedVacuumAgent, ReflexVacuumAgent] result = compare_agents(environment, agents) - performance_ModelBasedVacummAgent = result[0][1] - performance_ReflexVacummAgent = result[1][1] + performance_ModelBasedVacuumAgent = result[0][1] + performance_ReflexVacuumAgent = result[1][1] # The performance of ModelBasedVacuumAgent will be at least as good as that of # ReflexVacuumAgent, since ModelBasedVacuumAgent can identify when it has @@ -245,7 +243,7 @@ def test_compare_agents(): # NoOp leading to 0 performance change, whereas ReflexVacuumAgent cannot # identify the terminal state and thus will keep moving, leading to worse # performance compared to 
ModelBasedVacuumAgent. - assert performance_ReflexVacummAgent <= performance_ModelBasedVacummAgent + assert performance_ReflexVacuumAgent <= performance_ModelBasedVacuumAgent def test_TableDrivenAgentProgram(): @@ -254,8 +252,7 @@ def test_TableDrivenAgentProgram(): (('bar', 1),): 'action3', (('bar', 2),): 'action1', (('foo', 1), ('foo', 1),): 'action2', - (('foo', 1), ('foo', 2),): 'action3', - } + (('foo', 1), ('foo', 2),): 'action3'} agent_program = TableDrivenAgentProgram(table) assert agent_program(('foo', 1)) == 'action1' assert agent_program(('foo', 2)) == 'action3' @@ -272,19 +269,19 @@ def constant_prog(percept): def test_VacuumEnvironment(): - # Initialize Vacuum Environment + # initialize Vacuum Environment v = VacuumEnvironment(6, 6) - # Get an agent + # get an agent agent = ModelBasedVacuumAgent() agent.direction = Direction(Direction.R) v.add_thing(agent) v.add_thing(Dirt(), location=(2, 1)) - # Check if things are added properly + # check if things are added properly assert len([x for x in v.things if isinstance(x, Wall)]) == 20 assert len([x for x in v.things if isinstance(x, Dirt)]) == 1 - # Let the action begin! + # let the action begin! 
assert v.percept(agent) == ("Clean", "None") v.execute_action(agent, "Forward") assert v.percept(agent) == ("Dirty", "None") @@ -302,38 +299,37 @@ def test_WumpusEnvironment(): def constant_prog(percept): return percept - # Initialize Wumpus Environment + # initialize Wumpus Environment w = WumpusEnvironment(constant_prog) - # Check if things are added properly + # check if things are added properly assert len([x for x in w.things if isinstance(x, Wall)]) == 20 assert any(map(lambda x: isinstance(x, Gold), w.things)) assert any(map(lambda x: isinstance(x, Explorer), w.things)) assert not any(map(lambda x: not isinstance(x, Thing), w.things)) - # Check that gold and wumpus are not present on (1,1) - assert not any(map(lambda x: isinstance(x, Gold) or isinstance(x, WumpusEnvironment), - w.list_things_at((1, 1)))) + # check that gold and wumpus are not present on (1,1) + assert not any(map(lambda x: isinstance(x, Gold) or isinstance(x, WumpusEnvironment), w.list_things_at((1, 1)))) - # Check if w.get_world() segments objects correctly + # check if w.get_world() segments objects correctly assert len(w.get_world()) == 6 for row in w.get_world(): assert len(row) == 6 - # Start the game! + # start the game! 
agent = [x for x in w.things if isinstance(x, Explorer)][0] gold = [x for x in w.things if isinstance(x, Gold)][0] pit = [x for x in w.things if isinstance(x, Pit)][0] assert not w.is_done() - # Check Walls + # check Walls agent.location = (1, 2) percepts = w.percept(agent) assert len(percepts) == 5 assert any(map(lambda x: isinstance(x, Bump), percepts[0])) - # Check Gold + # check Gold agent.location = gold.location percepts = w.percept(agent) assert any(map(lambda x: isinstance(x, Glitter), percepts[4])) @@ -341,7 +337,7 @@ def constant_prog(percept): percepts = w.percept(agent) assert not any(map(lambda x: isinstance(x, Glitter), percepts[4])) - # Check agent death + # check agent death agent.location = pit.location assert w.in_danger(agent) assert not agent.alive @@ -355,7 +351,7 @@ def test_WumpusEnvironmentActions(): def constant_prog(percept): return percept - # Initialize Wumpus Environment + # initialize Wumpus Environment w = WumpusEnvironment(constant_prog) agent = [x for x in w.things if isinstance(x, Explorer)][0] diff --git a/tests/test_agents4e.py b/tests/test_agents4e.py index d94a86141..a84e67e7f 100644 --- a/tests/test_agents4e.py +++ b/tests/test_agents4e.py @@ -4,10 +4,9 @@ from agents4e import Agent, WumpusEnvironment, Explorer, Thing, Gold, Pit, Bump, Glitter from agents4e import Direction -from agents4e import ReflexVacuumAgent, ModelBasedVacuumAgent, TrivialVacuumEnvironment, compare_agents, \ - RandomVacuumAgent, TableDrivenVacuumAgent, TableDrivenAgentProgram, RandomAgentProgram, \ - SimpleReflexAgentProgram, ModelBasedReflexAgentProgram -from agents4e import Wall, VacuumEnvironment, Dirt +from agents4e import (ReflexVacuumAgent, ModelBasedVacuumAgent, TrivialVacuumEnvironment, compare_agents, + RandomVacuumAgent, TableDrivenVacuumAgent, TableDrivenAgentProgram, RandomAgentProgram, + SimpleReflexAgentProgram, ModelBasedReflexAgentProgram, Wall, VacuumEnvironment, Dirt) random.seed("aima-python") @@ -60,7 +59,7 @@ def test_add(): def 
test_RandomAgentProgram(): - # create a list of all the actions a vacuum cleaner can perform + # create a list of all the actions a Vacuum cleaner can perform list = ['Right', 'Left', 'Suck', 'NoOp'] # create a program and then an object of the RandomAgentProgram program = RandomAgentProgram(list) @@ -101,8 +100,7 @@ def test_TableDrivenAgent(): ((loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck', ((loc_B, 'Dirty'), (loc_B, 'Clean')): 'Left', ((loc_A, 'Dirty'), (loc_A, 'Clean'), (loc_B, 'Dirty')): 'Suck', - ((loc_B, 'Dirty'), (loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck' - } + ((loc_B, 'Dirty'), (loc_B, 'Clean'), (loc_A, 'Dirty')): 'Suck'} # create an program and then an object of the TableDrivenAgent program = TableDrivenAgentProgram(table) @@ -183,7 +181,7 @@ def matches(self, state): loc_A = (0, 0) loc_B = (1, 0) - # create rules for a two-state vacuum environment + # create rules for a two-state Vacuum Environment rules = [Rule((loc_A, "Dirty"), "Suck"), Rule((loc_A, "Clean"), "Right"), Rule((loc_B, "Dirty"), "Suck"), Rule((loc_B, "Clean"), "Left")] @@ -234,8 +232,8 @@ def test_compare_agents(): agents = [ModelBasedVacuumAgent, ReflexVacuumAgent] result = compare_agents(environment, agents) - performance_ModelBasedVacummAgent = result[0][1] - performance_ReflexVacummAgent = result[1][1] + performance_ModelBasedVacuumAgent = result[0][1] + performance_ReflexVacuumAgent = result[1][1] # The performance of ModelBasedVacuumAgent will be at least as good as that of # ReflexVacuumAgent, since ModelBasedVacuumAgent can identify when it has @@ -243,7 +241,7 @@ def test_compare_agents(): # NoOp leading to 0 performance change, whereas ReflexVacuumAgent cannot # identify the terminal state and thus will keep moving, leading to worse # performance compared to ModelBasedVacuumAgent. 
- assert performance_ReflexVacummAgent <= performance_ModelBasedVacummAgent + assert performance_ReflexVacuumAgent <= performance_ModelBasedVacuumAgent def test_TableDrivenAgentProgram(): @@ -252,12 +250,11 @@ def test_TableDrivenAgentProgram(): (('bar', 1),): 'action3', (('bar', 2),): 'action1', (('foo', 1), ('foo', 1),): 'action2', - (('foo', 1), ('foo', 2),): 'action3', - } + (('foo', 1), ('foo', 2),): 'action3'} agent_program = TableDrivenAgentProgram(table) assert agent_program(('foo', 1)) == 'action1' assert agent_program(('foo', 2)) == 'action3' - assert agent_program(('invalid percept',)) == None + assert agent_program(('invalid percept',)) is None def test_Agent(): @@ -270,19 +267,19 @@ def constant_prog(percept): def test_VacuumEnvironment(): - # Initialize Vacuum Environment + # initialize Vacuum Environment v = VacuumEnvironment(6, 6) - # Get an agent + # get an agent agent = ModelBasedVacuumAgent() agent.direction = Direction(Direction.R) v.add_thing(agent) v.add_thing(Dirt(), location=(2, 1)) - # Check if things are added properly + # check if things are added properly assert len([x for x in v.things if isinstance(x, Wall)]) == 20 assert len([x for x in v.things if isinstance(x, Dirt)]) == 1 - # Let the action begin! + # let the action begin! 
assert v.percept(agent) == ("Clean", "None") v.execute_action(agent, "Forward") assert v.percept(agent) == ("Dirty", "None") @@ -300,37 +297,37 @@ def test_WumpusEnvironment(): def constant_prog(percept): return percept - # Initialize Wumpus Environment + # initialize Wumpus Environment w = WumpusEnvironment(constant_prog) - # Check if things are added properly + # check if things are added properly assert len([x for x in w.things if isinstance(x, Wall)]) == 20 assert any(map(lambda x: isinstance(x, Gold), w.things)) assert any(map(lambda x: isinstance(x, Explorer), w.things)) assert not any(map(lambda x: not isinstance(x, Thing), w.things)) - # Check that gold and wumpus are not present on (1,1) + # check that gold and wumpus are not present on (1,1) assert not any(map(lambda x: isinstance(x, Gold) or isinstance(x, WumpusEnvironment), w.list_things_at((1, 1)))) - # Check if w.get_world() segments objects correctly + # check if w.get_world() segments objects correctly assert len(w.get_world()) == 6 for row in w.get_world(): assert len(row) == 6 - # Start the game! + # start the game! 
agent = [x for x in w.things if isinstance(x, Explorer)][0] gold = [x for x in w.things if isinstance(x, Gold)][0] pit = [x for x in w.things if isinstance(x, Pit)][0] assert not w.is_done() - # Check Walls + # check Walls agent.location = (1, 2) percepts = w.percept(agent) assert len(percepts) == 5 assert any(map(lambda x: isinstance(x, Bump), percepts[0])) - # Check Gold + # check Gold agent.location = gold.location percepts = w.percept(agent) assert any(map(lambda x: isinstance(x, Glitter), percepts[4])) @@ -338,7 +335,7 @@ def constant_prog(percept): percepts = w.percept(agent) assert not any(map(lambda x: isinstance(x, Glitter), percepts[4])) - # Check agent death + # check agent death agent.location = pit.location assert w.in_danger(agent) assert not agent.alive @@ -352,7 +349,7 @@ def test_WumpusEnvironmentActions(): def constant_prog(percept): return percept - # Initialize Wumpus Environment + # initialize Wumpus Environment w = WumpusEnvironment(constant_prog) agent = [x for x in w.things if isinstance(x, Explorer)][0] diff --git a/tests/test_deep_learning4e.py b/tests/test_deep_learning4e.py index d0a05bc49..2a611076c 100644 --- a/tests/test_deep_learning4e.py +++ b/tests/test_deep_learning4e.py @@ -9,11 +9,11 @@ def test_neural_net(): - iris = DataSet(name="iris") - classes = ["setosa", "versicolor", "virginica"] + iris = DataSet(name='iris') + classes = ['setosa', 'versicolor', 'virginica'] iris.classes_to_numbers(classes) - nn_adam = neural_net_learner(iris, [4], learning_rate=0.001, epochs=200, optimizer=adam_optimizer) - nn_gd = neural_net_learner(iris, [4], learning_rate=0.15, epochs=100, optimizer=gradient_descent) + nnl_adam = NeuralNetLearner(iris, [4], learning_rate=0.001, epochs=200, optimizer=adam_optimizer) + nnl_gd = NeuralNetLearner(iris, [4], learning_rate=0.15, epochs=100, optimizer=gradient_descent) tests = [([5.0, 3.1, 0.9, 0.1], 0), ([5.1, 3.5, 1.0, 0.0], 0), ([4.9, 3.3, 1.1, 0.1], 0), @@ -23,25 +23,25 @@ def test_neural_net(): ([7.5, 
4.1, 6.2, 2.3], 2), ([7.3, 4.0, 6.1, 2.4], 2), ([7.0, 3.3, 6.1, 2.5], 2)] - assert grade_learner(nn_adam, tests) >= 1 / 3 - assert grade_learner(nn_gd, tests) >= 1 / 3 - assert err_ratio(nn_adam, iris) < 0.21 - assert err_ratio(nn_gd, iris) < 0.21 + assert grade_learner(nnl_adam, tests) >= 1 / 3 + assert grade_learner(nnl_gd, tests) >= 1 / 3 + assert err_ratio(nnl_adam, iris) < 0.21 + assert err_ratio(nnl_gd, iris) < 0.21 def test_perceptron(): - iris = DataSet(name="iris") - classes = ["setosa", "versicolor", "virginica"] + iris = DataSet(name='iris') + classes = ['setosa', 'versicolor', 'virginica'] iris.classes_to_numbers(classes) - perceptron = perceptron_learner(iris, learning_rate=0.01, epochs=100) + pl = PerceptronLearner(iris, learning_rate=0.01, epochs=100) tests = [([5, 3, 1, 0.1], 0), ([5, 3.5, 1, 0], 0), ([6, 3, 4, 1.1], 1), ([6, 2, 3.5, 1], 1), ([7.5, 4, 6, 2], 2), ([7, 3, 6, 2.5], 2)] - assert grade_learner(perceptron, tests) > 1 / 2 - assert err_ratio(perceptron, iris) < 0.4 + assert grade_learner(pl, tests) > 1 / 2 + assert err_ratio(pl, iris) < 0.4 def test_rnn(): @@ -49,20 +49,19 @@ def test_rnn(): train, val, test = keras_dataset_loader(data) train = (train[0][:1000], train[1][:1000]) val = (val[0][:200], val[1][:200]) - model = simple_rnn_learner(train, val) - score = model.evaluate(test[0][:200], test[1][:200], verbose=0) - acc = score[1] - assert acc >= 0.3 + rnn = SimpleRNNLearner(train, val) + score = rnn.evaluate(test[0][:200], test[1][:200], verbose=0) + assert score[1] >= 0.3 def test_auto_encoder(): - iris = DataSet(name="iris") - classes = ["setosa", "versicolor", "virginica"] + iris = DataSet(name='iris') + classes = ['setosa', 'versicolor', 'virginica'] iris.classes_to_numbers(classes) inputs = np.asarray(iris.examples) - model = auto_encoder_learner(inputs, 100) + al = AutoencoderLearner(inputs, 100) print(inputs[0]) - print(model.predict(inputs[:1])) + print(al.predict(inputs[:1])) if __name__ == "__main__": diff --git 
a/tests/test_learning.py b/tests/test_learning.py index 1cf24984f..1590a4d33 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -11,8 +11,8 @@ def test_exclude(): def test_parse_csv(): - Iris = open_data('iris.csv').read() - assert parse_csv(Iris)[0] == [5.1, 3.5, 1.4, 0.2, 'setosa'] + iris = open_data('iris.csv').read() + assert parse_csv(iris)[0] == [5.1, 3.5, 1.4, 0.2, 'setosa'] def test_weighted_mode(): @@ -24,99 +24,37 @@ def test_weighted_replicate(): def test_means_and_deviation(): - iris = DataSet(name="iris") - + iris = DataSet(name='iris') means, deviations = iris.find_means_and_deviations() - - assert round(means["setosa"][0], 3) == 5.006 - assert round(means["versicolor"][0], 3) == 5.936 - assert round(means["virginica"][0], 3) == 6.588 - - assert round(deviations["setosa"][0], 3) == 0.352 - assert round(deviations["versicolor"][0], 3) == 0.516 - assert round(deviations["virginica"][0], 3) == 0.636 + assert round(means['setosa'][0], 3) == 5.006 + assert round(means['versicolor'][0], 3) == 5.936 + assert round(means['virginica'][0], 3) == 6.588 + assert round(deviations['setosa'][0], 3) == 0.352 + assert round(deviations['versicolor'][0], 3) == 0.516 + assert round(deviations['virginica'][0], 3) == 0.636 def test_plurality_learner(): - zoo = DataSet(name="zoo") - - pL = PluralityLearner(zoo) - assert pL([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 1, 0, 1]) == "mammal" - - -def test_naive_bayes(): - iris = DataSet(name="iris") - - # Discrete - nBD = NaiveBayesLearner(iris, continuous=False) - assert nBD([5, 3, 1, 0.1]) == "setosa" - assert nBD([6, 3, 4, 1.1]) == "versicolor" - assert nBD([7.7, 3, 6, 2]) == "virginica" - - # Continuous - nBC = NaiveBayesLearner(iris, continuous=True) - assert nBC([5, 3, 1, 0.1]) == "setosa" - assert nBC([6, 5, 3, 1.5]) == "versicolor" - assert nBC([7, 3, 6.5, 2]) == "virginica" - - # Simple - data1 = 'a' * 50 + 'b' * 30 + 'c' * 15 - dist1 = CountingProbDist(data1) - data2 = 'a' * 30 + 'b' * 45 + 'c' * 20 - 
dist2 = CountingProbDist(data2) - data3 = 'a' * 20 + 'b' * 20 + 'c' * 35 - dist3 = CountingProbDist(data3) - - dist = {('First', 0.5): dist1, ('Second', 0.3): dist2, ('Third', 0.2): dist3} - nBS = NaiveBayesLearner(dist, simple=True) - assert nBS('aab') == 'First' - assert nBS(['b', 'b']) == 'Second' - assert nBS('ccbcc') == 'Third' + zoo = DataSet(name='zoo') + pl = PluralityLearner(zoo) + assert pl([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 1, 0, 1]) == 'mammal' def test_k_nearest_neighbors(): - iris = DataSet(name="iris") - kNN = NearestNeighborLearner(iris, k=3) - assert kNN([5, 3, 1, 0.1]) == "setosa" - assert kNN([5, 3, 1, 0.1]) == "setosa" - assert kNN([6, 5, 3, 1.5]) == "versicolor" - assert kNN([7.5, 4, 6, 2]) == "virginica" - - -def test_truncated_svd(): - test_mat = [[17, 0], - [0, 11]] - _, _, eival = truncated_svd(test_mat) - assert isclose(eival[0], 17) - assert isclose(eival[1], 11) - - test_mat = [[17, 0], - [0, -34]] - _, _, eival = truncated_svd(test_mat) - assert isclose(eival[0], 34) - assert isclose(eival[1], 17) - - test_mat = [[1, 0, 0, 0, 2], - [0, 0, 3, 0, 0], - [0, 0, 0, 0, 0], - [0, 2, 0, 0, 0]] - _, _, eival = truncated_svd(test_mat) - assert isclose(eival[0], 3) - assert isclose(eival[1], 5 ** 0.5) - - test_mat = [[3, 2, 2], - [2, 3, -2]] - _, _, eival = truncated_svd(test_mat) - assert isclose(eival[0], 5) - assert isclose(eival[1], 3) + iris = DataSet(name='iris') + knn = NearestNeighborLearner(iris, k=3) + assert knn([5, 3, 1, 0.1]) == 'setosa' + assert knn([5, 3, 1, 0.1]) == 'setosa' + assert knn([6, 5, 3, 1.5]) == 'versicolor' + assert knn([7.5, 4, 6, 2]) == 'virginica' def test_decision_tree_learner(): - iris = DataSet(name="iris") - dTL = DecisionTreeLearner(iris) - assert dTL([5, 3, 1, 0.1]) == "setosa" - assert dTL([6, 5, 3, 1.5]) == "versicolor" - assert dTL([7.5, 4, 6, 2]) == "virginica" + iris = DataSet(name='iris') + dtl = DecisionTreeLearner(iris) + assert dtl([5, 3, 1, 0.1]) == 'setosa' + assert dtl([6, 5, 3, 1.5]) == 
'versicolor' + assert dtl([7.5, 4, 6, 2]) == 'virginica' def test_information_content(): @@ -129,22 +67,22 @@ def test_information_content(): def test_random_forest(): - iris = DataSet(name="iris") - rF = RandomForest(iris) - tests = [([5.0, 3.0, 1.0, 0.1], "setosa"), - ([5.1, 3.3, 1.1, 0.1], "setosa"), - ([6.0, 5.0, 3.0, 1.0], "versicolor"), - ([6.1, 2.2, 3.5, 1.0], "versicolor"), - ([7.5, 4.1, 6.2, 2.3], "virginica"), - ([7.3, 3.7, 6.1, 2.5], "virginica")] - assert grade_learner(rF, tests) >= 1 / 3 + iris = DataSet(name='iris') + rf = RandomForest(iris) + tests = [([5.0, 3.0, 1.0, 0.1], 'setosa'), + ([5.1, 3.3, 1.1, 0.1], 'setosa'), + ([6.0, 5.0, 3.0, 1.0], 'versicolor'), + ([6.1, 2.2, 3.5, 1.0], 'versicolor'), + ([7.5, 4.1, 6.2, 2.3], 'virginica'), + ([7.3, 3.7, 6.1, 2.5], 'virginica')] + assert grade_learner(rf, tests) >= 1 / 3 def test_neural_network_learner(): - iris = DataSet(name="iris") - classes = ["setosa", "versicolor", "virginica"] + iris = DataSet(name='iris') + classes = ['setosa', 'versicolor', 'virginica'] iris.classes_to_numbers(classes) - nNL = NeuralNetLearner(iris, [5], 0.15, 75) + nnl = NeuralNetLearner(iris, [5], 0.15, 75) tests = [([5.0, 3.1, 0.9, 0.1], 0), ([5.1, 3.5, 1.0, 0.0], 0), ([4.9, 3.3, 1.1, 0.1], 0), @@ -154,22 +92,22 @@ def test_neural_network_learner(): ([7.5, 4.1, 6.2, 2.3], 2), ([7.3, 4.0, 6.1, 2.4], 2), ([7.0, 3.3, 6.1, 2.5], 2)] - assert grade_learner(nNL, tests) >= 1 / 3 - assert err_ratio(nNL, iris) < 0.21 + assert grade_learner(nnl, tests) >= 1 / 3 + assert err_ratio(nnl, iris) < 0.21 def test_perceptron(): - iris = DataSet(name="iris") + iris = DataSet(name='iris') iris.classes_to_numbers() - perceptron = PerceptronLearner(iris) + pl = PerceptronLearner(iris) tests = [([5, 3, 1, 0.1], 0), ([5, 3.5, 1, 0], 0), ([6, 3, 4, 1.1], 1), ([6, 2, 3.5, 1], 1), ([7.5, 4, 6, 2], 2), ([7, 3, 6, 2.5], 2)] - assert grade_learner(perceptron, tests) > 1 / 2 - assert err_ratio(perceptron, iris) < 0.4 + assert grade_learner(pl, tests) > 1 / 
2 + assert err_ratio(pl, iris) < 0.4 def test_random_weights(): @@ -182,20 +120,19 @@ def test_random_weights(): assert min_value <= weight <= max_value -def test_adaBoost(): - iris = DataSet(name="iris") +def test_ada_boost(): + iris = DataSet(name='iris') iris.classes_to_numbers() - WeightedPerceptron = WeightedLearner(PerceptronLearner) - AdaBoostLearner = AdaBoost(WeightedPerceptron, 5) - adaBoost = AdaBoostLearner(iris) + wl = WeightedLearner(PerceptronLearner) + ab = ada_boost(iris, wl, 5) tests = [([5, 3, 1, 0.1], 0), ([5, 3.5, 1, 0], 0), ([6, 3, 4, 1.1], 1), ([6, 2, 3.5, 1], 1), ([7.5, 4, 6, 2], 2), ([7, 3, 6, 2.5], 2)] - assert grade_learner(adaBoost, tests) > 4 / 6 - assert err_ratio(adaBoost, iris) < 0.25 + assert grade_learner(ab, tests) > 4 / 6 + assert err_ratio(ab, iris) < 0.25 if __name__ == "__main__": diff --git a/tests/test_learning4e.py b/tests/test_learning4e.py index 82cf835dc..987a9bffc 100644 --- a/tests/test_learning4e.py +++ b/tests/test_learning4e.py @@ -1,6 +1,7 @@ import pytest -from learning import * +from deep_learning4e import PerceptronLearner +from learning4e import * random.seed("aima-python") @@ -11,8 +12,8 @@ def test_exclude(): def test_parse_csv(): - Iris = open_data('iris.csv').read() - assert parse_csv(Iris)[0] == [5.1, 3.5, 1.4, 0.2, 'setosa'] + iris = open_data('iris.csv').read() + assert parse_csv(iris)[0] == [5.1, 3.5, 1.4, 0.2, 'setosa'] def test_weighted_mode(): @@ -24,25 +25,37 @@ def test_weighted_replicate(): def test_means_and_deviation(): - iris = DataSet(name="iris") - + iris = DataSet(name='iris') means, deviations = iris.find_means_and_deviations() + assert round(means['setosa'][0], 3) == 5.006 + assert round(means['versicolor'][0], 3) == 5.936 + assert round(means['virginica'][0], 3) == 6.588 + assert round(deviations['setosa'][0], 3) == 0.352 + assert round(deviations['versicolor'][0], 3) == 0.516 + assert round(deviations['virginica'][0], 3) == 0.636 + + +def test_plurality_learner(): + zoo = 
DataSet(name='zoo') + pl = PluralityLearner(zoo) + assert pl([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 4, 1, 0, 1]) == 'mammal' - assert round(means["setosa"][0], 3) == 5.006 - assert round(means["versicolor"][0], 3) == 5.936 - assert round(means["virginica"][0], 3) == 6.588 - assert round(deviations["setosa"][0], 3) == 0.352 - assert round(deviations["versicolor"][0], 3) == 0.516 - assert round(deviations["virginica"][0], 3) == 0.636 +def test_k_nearest_neighbors(): + iris = DataSet(name='iris') + knn = NearestNeighborLearner(iris, k=3) + assert knn([5, 3, 1, 0.1]) == 'setosa' + assert knn([5, 3, 1, 0.1]) == 'setosa' + assert knn([6, 5, 3, 1.5]) == 'versicolor' + assert knn([7.5, 4, 6, 2]) == 'virginica' def test_decision_tree_learner(): - iris = DataSet(name="iris") - dTL = DecisionTreeLearner(iris) - assert dTL([5, 3, 1, 0.1]) == "setosa" - assert dTL([6, 5, 3, 1.5]) == "versicolor" - assert dTL([7.5, 4, 6, 2]) == "virginica" + iris = DataSet(name='iris') + dtl = DecisionTreeLearner(iris) + assert dtl([5, 3, 1, 0.1]) == 'setosa' + assert dtl([6, 5, 3, 1.5]) == 'versicolor' + assert dtl([7.5, 4, 6, 2]) == 'virginica' def test_information_content(): @@ -55,15 +68,15 @@ def test_information_content(): def test_random_forest(): - iris = DataSet(name="iris") - rF = RandomForest(iris) - tests = [([5.0, 3.0, 1.0, 0.1], "setosa"), - ([5.1, 3.3, 1.1, 0.1], "setosa"), - ([6.0, 5.0, 3.0, 1.0], "versicolor"), - ([6.1, 2.2, 3.5, 1.0], "versicolor"), - ([7.5, 4.1, 6.2, 2.3], "virginica"), - ([7.3, 3.7, 6.1, 2.5], "virginica")] - assert grade_learner(rF, tests) >= 1 / 3 + iris = DataSet(name='iris') + rf = RandomForest(iris) + tests = [([5.0, 3.0, 1.0, 0.1], 'setosa'), + ([5.1, 3.3, 1.1, 0.1], 'setosa'), + ([6.0, 5.0, 3.0, 1.0], 'versicolor'), + ([6.1, 2.2, 3.5, 1.0], 'versicolor'), + ([7.5, 4.1, 6.2, 2.3], 'virginica'), + ([7.3, 3.7, 6.1, 2.5], 'virginica')] + assert grade_learner(rf, tests) >= 1 / 3 def test_random_weights(): @@ -76,20 +89,19 @@ def test_random_weights(): assert 
min_value <= weight <= max_value -def test_adaBoost(): - iris = DataSet(name="iris") +def test_ada_boost(): + iris = DataSet(name='iris') iris.classes_to_numbers() - WeightedPerceptron = WeightedLearner(PerceptronLearner) - AdaBoostLearner = AdaBoost(WeightedPerceptron, 5) - adaBoost = AdaBoostLearner(iris) + wl = WeightedLearner(PerceptronLearner) + ab = ada_boost(iris, wl, 5) tests = [([5, 3, 1, 0.1], 0), ([5, 3.5, 1, 0], 0), ([6, 3, 4, 1.1], 1), ([6, 2, 3.5, 1], 1), ([7.5, 4, 6, 2], 2), ([7, 3, 6, 2.5], 2)] - assert grade_learner(adaBoost, tests) > 4 / 6 - assert err_ratio(adaBoost, iris) < 0.25 + assert grade_learner(ab, tests) > 4 / 6 + assert err_ratio(ab, iris) < 0.25 if __name__ == "__main__": diff --git a/tests/test_probabilistic_learning.py b/tests/test_probabilistic_learning.py new file mode 100644 index 000000000..bd37b6ebb --- /dev/null +++ b/tests/test_probabilistic_learning.py @@ -0,0 +1,38 @@ +import random + +import pytest + +from learning import DataSet +from probabilistic_learning import * + +random.seed("aima-python") + + +def test_naive_bayes(): + iris = DataSet(name='iris') + # discrete + nbd = NaiveBayesLearner(iris, continuous=False) + assert nbd([5, 3, 1, 0.1]) == 'setosa' + assert nbd([6, 3, 4, 1.1]) == 'versicolor' + assert nbd([7.7, 3, 6, 2]) == 'virginica' + # continuous + nbc = NaiveBayesLearner(iris, continuous=True) + assert nbc([5, 3, 1, 0.1]) == 'setosa' + assert nbc([6, 5, 3, 1.5]) == 'versicolor' + assert nbc([7, 3, 6.5, 2]) == 'virginica' + # simple + data1 = 'a' * 50 + 'b' * 30 + 'c' * 15 + dist1 = CountingProbDist(data1) + data2 = 'a' * 30 + 'b' * 45 + 'c' * 20 + dist2 = CountingProbDist(data2) + data3 = 'a' * 20 + 'b' * 20 + 'c' * 35 + dist3 = CountingProbDist(data3) + dist = {('First', 0.5): dist1, ('Second', 0.3): dist2, ('Third', 0.2): dist3} + nbs = NaiveBayesLearner(dist, simple=True) + assert nbs('aab') == 'First' + assert nbs(['b', 'b']) == 'Second' + assert nbs('ccbcc') == 'Third' + + +if __name__ == "__main__": + 
pytest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py index 5ccafe157..672784bef 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,17 +15,17 @@ def test_sequence(): assert sequence(([1, 2], [3, 4], [5, 6])) == ([1, 2], [3, 4], [5, 6]) -def test_removeall_list(): - assert removeall(4, []) == [] - assert removeall(4, [1, 2, 3, 4]) == [1, 2, 3] - assert removeall(4, [4, 1, 4, 2, 3, 4, 4]) == [1, 2, 3] - assert removeall(1, [2, 3, 4, 5, 6]) == [2, 3, 4, 5, 6] +def test_remove_all_list(): + assert remove_all(4, []) == [] + assert remove_all(4, [1, 2, 3, 4]) == [1, 2, 3] + assert remove_all(4, [4, 1, 4, 2, 3, 4, 4]) == [1, 2, 3] + assert remove_all(1, [2, 3, 4, 5, 6]) == [2, 3, 4, 5, 6] -def test_removeall_string(): - assert removeall('s', '') == '' - assert removeall('s', 'This is a test. Was a test.') == 'Thi i a tet. Wa a tet.' - assert removeall('a', 'artificial intelligence: a modern approach') == 'rtificil intelligence: modern pproch' +def test_remove_all_string(): + assert remove_all('s', '') == '' + assert remove_all('s', 'This is a test. Was a test.') == 'Thi i a tet. Wa a tet.' 
+ assert remove_all('a', 'artificial intelligence: a modern approach') == 'rtificil intelligence: modern pproch' def test_unique(): @@ -261,6 +261,34 @@ def test_sigmoid_derivative(): assert sigmoid_derivative(value) == -6 +def test_truncated_svd(): + test_mat = [[17, 0], + [0, 11]] + _, _, eival = truncated_svd(test_mat) + assert isclose(eival[0], 17) + assert isclose(eival[1], 11) + + test_mat = [[17, 0], + [0, -34]] + _, _, eival = truncated_svd(test_mat) + assert isclose(eival[0], 34) + assert isclose(eival[1], 17) + + test_mat = [[1, 0, 0, 0, 2], + [0, 0, 3, 0, 0], + [0, 0, 0, 0, 0], + [0, 2, 0, 0, 0]] + _, _, eival = truncated_svd(test_mat) + assert isclose(eival[0], 3) + assert isclose(eival[1], 5 ** 0.5) + + test_mat = [[3, 2, 2], + [2, 3, -2]] + _, _, eival = truncated_svd(test_mat) + assert isclose(eival[0], 5) + assert isclose(eival[1], 3) + + def test_weighted_choice(): choices = [('a', 0.5), ('b', 0.3), ('c', 0.2)] choice = weighted_choice(choices) @@ -340,11 +368,10 @@ def test_expr(): assert expr('P & Q <=> Q & P') == Expr('<=>', (P & Q), (Q & P)) assert expr('P(x) | P(y) & Q(z)') == (P(x) | (P(y) & Q(z))) # x is grandparent of z if x is parent of y and y is parent of z: - assert (expr('GP(x, z) <== P(x, y) & P(y, z)') - == Expr('<==', GP(x, z), P(x, y) & P(y, z))) + assert (expr('GP(x, z) <== P(x, y) & P(y, z)') == Expr('<==', GP(x, z), P(x, y) & P(y, z))) -def test_min_priorityqueue(): +def test_min_priority_queue(): queue = PriorityQueue(f=lambda x: x[1]) queue.append((1, 100)) queue.append((2, 30)) @@ -360,7 +387,7 @@ def test_min_priorityqueue(): assert len(queue) == 2 -def test_max_priorityqueue(): +def test_max_priority_queue(): queue = PriorityQueue(order='max', f=lambda x: x[1]) queue.append((1, 100)) queue.append((2, 30)) @@ -368,7 +395,7 @@ def test_max_priorityqueue(): assert queue.pop() == (1, 100) -def test_priorityqueue_with_objects(): +def test_priority_queue_with_objects(): class Test: def __init__(self, a, b): self.a = a diff --git 
a/text.py b/text.py index 3a2d9d7aa..bf1809f96 100644 --- a/text.py +++ b/text.py @@ -5,7 +5,7 @@ working on a tiny sample of Unix manual pages.""" from utils import argmin, argmax, hashabledict -from learning import CountingProbDist +from probabilistic_learning import CountingProbDist import search from math import log, exp diff --git a/utils.py b/utils.py index 897147539..75d4547cf 100644 --- a/utils.py +++ b/utils.py @@ -25,7 +25,7 @@ def sequence(iterable): else tuple([iterable])) -def removeall(item, seq): +def remove_all(item, seq): """Return a copy of seq (or string) with all occurrences of item removed.""" if isinstance(seq, str): return seq.replace(item, '') @@ -305,7 +305,7 @@ def manhattan_distance(X, Y): def mean_boolean_error(X, Y): - return mean(int(x != y) for x, y in zip(X, Y)) + return mean(x != y for x, y in zip(X, Y)) def hamming_distance(X, Y): @@ -329,6 +329,10 @@ def norm(X, n=2): return sum([x ** n for x in X]) ** (1 / n) +def random_weights(min_value, max_value, num_weights): + return [random.uniform(min_value, max_value) for _ in range(num_weights)] + + def clip(x, lowest, highest): """Return x clipped to the range [lowest..highest].""" return max(lowest, min(x, highest)) @@ -414,6 +418,71 @@ def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): """Return true if numbers a and b are close to each other.""" return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + +def truncated_svd(X, num_val=2, max_iter=1000): + """Compute the first component of SVD.""" + + def normalize_vec(X, n=2): + """Normalize two parts (:m and m:) of the vector.""" + X_m = X[:m] + X_n = X[m:] + norm_X_m = norm(X_m, n) + Y_m = [x / norm_X_m for x in X_m] + norm_X_n = norm(X_n, n) + Y_n = [x / norm_X_n for x in X_n] + return Y_m + Y_n + + def remove_component(X): + """Remove components of already obtained eigen vectors from X.""" + X_m = X[:m] + X_n = X[m:] + for eivec in eivec_m: + coeff = dotproduct(X_m, eivec) + X_m = [x1 - coeff * x2 for x1, x2 in zip(X_m, 
eivec)] + for eivec in eivec_n: + coeff = dotproduct(X_n, eivec) + X_n = [x1 - coeff * x2 for x1, x2 in zip(X_n, eivec)] + return X_m + X_n + + m, n = len(X), len(X[0]) + A = [[0] * (n + m) for _ in range(n + m)] + for i in range(m): + for j in range(n): + A[i][m + j] = A[m + j][i] = X[i][j] + + eivec_m = [] + eivec_n = [] + eivals = [] + + for _ in range(num_val): + X = [random.random() for _ in range(m + n)] + X = remove_component(X) + X = normalize_vec(X) + + for i in range(max_iter): + old_X = X + X = matrix_multiplication(A, [[x] for x in X]) + X = [x[0] for x in X] + X = remove_component(X) + X = normalize_vec(X) + # check for convergence + if norm([x1 - x2 for x1, x2 in zip(old_X, X)]) <= 1e-10: + break + + projected_X = matrix_multiplication(A, [[x] for x in X]) + projected_X = [x[0] for x in projected_X] + new_eigenvalue = norm(projected_X, 1) / norm(X, 1) + ev_m = X[:m] + ev_n = X[m:] + if new_eigenvalue < 0: + new_eigenvalue = -new_eigenvalue + ev_m = [-ev_m_i for ev_m_i in ev_m] + eivals.append(new_eigenvalue) + eivec_m.append(ev_m) + eivec_n.append(ev_n) + return eivec_m, eivec_n, eivals + + # ______________________________________________________________________________ # Grid Functions diff --git a/utils4e.py b/utils4e.py index 2681602ac..792fa9e22 100644 --- a/utils4e.py +++ b/utils4e.py @@ -90,7 +90,7 @@ def sequence(iterable): else tuple([iterable])) -def removeall(item, seq): +def remove_all(item, seq): """Return a copy of seq (or string) with all occurrences of item removed.""" if isinstance(seq, str): return seq.replace(item, '')