Project import generated by Copybara.
PiperOrigin-RevId: 177753043
tf-transform-team authored and elmer-garduno committed Dec 4, 2017
1 parent 6c83b47 commit 3b6b81b
Showing 8 changed files with 220 additions and 346 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -57,6 +57,7 @@ releasing a new version.
|tensorflow-transform |tensorflow |apache-beam[gcp]|
|--------------------------------------------------------------------------------|--------------|----------------|
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.2.0 |
|[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md) |1.4 |2.2.0 |
|[0.3.1](https://github.com/tensorflow/transform/blob/v0.3.1/RELEASE.md) |1.3 |2.1.1 |
|[0.3.0](https://github.com/tensorflow/transform/blob/v0.3.0/RELEASE.md) |1.3 |2.1.1 |
|[0.1.10](https://github.com/tensorflow/transform/blob/v0.1.10/RELEASE.md) |1.0 |2.0.0 |
17 changes: 12 additions & 5 deletions RELEASE.md
@@ -1,11 +1,12 @@
# Current version (not yet released; still in development)
# Release 0.4.0

## Major Features and Improvements
* Added a combine_analyzer() that supports a user-provided combiner conforming
  to beam.CombineFn(). This allows users to implement custom combiners
  (e.g. median), to complement analyzers (like min, max) that are
  prepackaged in TFT (see the sketch after this list).
* Quantiles Analyzer (`tft.quantiles`).
* Quantiles Analyzer (`tft.quantiles`), with a corresponding `tft.bucketize`
  mapper.
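
As a sketch of the new interface (the names `_MedianCombinerSpec` and `median`
are hypothetical, and the import path for `CombinerSpec`/`combine_analyzer` is
assumed to be `tensorflow_transform.analyzers`), a custom median analyzer
might look roughly like this:

```python
import numpy as np
import tensorflow as tf

from tensorflow_transform import analyzers


class _MedianCombinerSpec(analyzers.CombinerSpec):
  """Accumulates every value seen and emits the median."""

  def create_accumulator(self):
    return []

  def add_input(self, accumulator, next_input):
    # next_input is an ndarray holding one batch of the input tensor.
    return accumulator + list(next_input.flatten())

  def merge_accumulators(self, accumulators):
    return [value for acc in accumulators for value in acc]

  def extract_output(self, accumulator):
    # Mirrors _NumPyCombinerSpec, which wraps its single output in a list.
    return [np.median(accumulator)]


def median(x, name=None):
  # np.median returns a float64 scalar, hence dtype tf.float64 and shape [].
  return analyzers.combine_analyzer(
      x, tf.float64, [], _MedianCombinerSpec(), name or 'median')
```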

## Bug Fixes and Other Changes
* Depends on `apache-beam[gcp]>=2.2,<3`.
@@ -33,9 +34,15 @@
* Some functions now introduce a new name scope when they did not before, so
  the names of tensors may change. This will only affect you if you directly
  look up tensors by name in the graph produced by tf.Transform.
* Various Analyzer Specs (_NumericCombineSpec, _UniquesSpec, _QuantilesSpec) are
  now private. Analyzers are accessible only via the top-level TFT functions (
  min, max, sum, size, mean, var, uniques, quantiles).
* Various Analyzer Specs (\_NumericCombineSpec, \_UniquesSpec, \_QuantilesSpec)
  are now private. Analyzers are accessible only via the top-level TFT functions
  (min, max, sum, size, mean, var, uniques, quantiles).

## Upcoming deprecations
* The `serving_input_fn`s in `tensorflow_transform/saved/input_fn_maker.py`
  will be removed in a future version and should not be used in new code;
  see the `examples` directory for details on how to define your own serving
  functions (a minimal sketch follows).
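
For orientation, a hand-rolled serving function might look roughly like the
sketch below (the raw feature spec is illustrative only, and the step that
applies the saved tf.Transform graph is elided; see the `examples` directory
for the complete pattern):

```python
import tensorflow as tf


def example_serving_input_fn():
  # Placeholder for serialized tf.Example protos arriving at serving time.
  serialized = tf.placeholder(tf.string, shape=[None], name='input_example')
  raw_feature_spec = {'x': tf.FixedLenFeature([], tf.float32)}  # illustrative
  raw_features = tf.parse_example(serialized, raw_feature_spec)
  # Apply the saved tf.Transform graph to raw_features here to produce
  # transformed features (elided).
  return tf.estimator.export.ServingInputReceiver(
      features=raw_features, receiver_tensors={'examples': serialized})
```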

# Release 0.3.1

2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
from setuptools import setup

# Tensorflow transform version.
__version__ = '0.4.0dev'
__version__ = '0.4.0'


def _make_required_install_packages():
181 changes: 98 additions & 83 deletions tensorflow_transform/analyzers.py
@@ -30,6 +30,7 @@

import re

import numpy as np
import tensorflow as tf


@@ -102,32 +103,102 @@ def output_is_asset(self, output_tensor):
    return self._output_is_asset_map[output_tensor]


class _NumericCombineSpec(object):
  """Operation to combine numeric values."""
class CombinerSpec(object):
  """Analyze using combiner function.

  This object mirrors a beam.CombineFn, which will receive a beam PCollection
  representing the batched input tensors.
  """

  def create_accumulator(self):
    """Return a fresh, empty accumulator.

    Returns: An empty accumulator.  This can be a Python value.
    """
    raise NotImplementedError

  def add_input(self, accumulator, element):
    """Return result of folding element into accumulator.

    Args:
      accumulator: the current accumulator
      element: the element to add, which will be an ndarray representing the
        value of the input for a batch.

    Returns: An accumulator that includes the additional element.
    """
    raise NotImplementedError

  def merge_accumulators(self, accumulators):
    """Merges several accumulators to a single accumulator value.

    Args:
      accumulators: the accumulators to merge

    Returns: The sole merged accumulator.
    """
    raise NotImplementedError

  def extract_output(self, accumulator):
    """Return result of converting accumulator into the output value.

    Args:
      accumulator: the final accumulator value.  Should be a list of ndarrays.

    Returns: An ndarray representing the result of this combiner.
    """
    raise NotImplementedError

  MIN = 'min'
  MAX = 'max'
  SUM = 'sum'

  def __init__(self, dtype, combiner_type, reduce_instance_dims):
    self._dtype = dtype
    self._combiner_type = combiner_type
def combine_analyzer(x, output_dtype, output_shape, combiner_spec, name):
  """Applies the combiner over the whole dataset.

  Args:
    x: An input `Tensor` or `SparseTensor`.
    output_dtype: The dtype of the output of the analyzer.
    output_shape: The shape of the output of the analyzer.
    combiner_spec: An instance of a CombinerSpec subclass.
    name: Similar to a TF op name.  Used to define a unique scope for this
      analyzer, which can be used for debugging info.

  Returns:
    The combined values, which is a `Tensor` with type output_dtype and shape
    `output_shape`.  These must be compatible with the combiner_spec.
  """
  return Analyzer([x], [(output_dtype, output_shape, False)], combiner_spec,
                  name).outputs[0]
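# Usage sketch (not part of this module): inside a preprocessing_fn, a custom
# combiner can be applied to a feature via combine_analyzer; `MyCombinerSpec`
# below is an illustrative CombinerSpec subclass, not an API defined here.
#
#   def preprocessing_fn(inputs):
#     combined = combine_analyzer(
#         inputs['x'], tf.float32, [], MyCombinerSpec(), 'my_combine')
#     return {'x_centered': inputs['x'] - combined}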


class _NumPyCombinerSpec(CombinerSpec):
  """Combines the PCollection only on the 0th dimension using numpy arrays."""

  def __init__(self, fn, reduce_instance_dims):
    self._fn = fn
    self._reduce_instance_dims = reduce_instance_dims

  @property
  def dtype(self):
    return self._dtype
  def create_accumulator(self):
    return None

  @property
  def combiner_type(self):
    return self._combiner_type
  def add_input(self, accumulator, next_input):
    if self._reduce_instance_dims:
      batch = self._fn(next_input)
    else:
      batch = self._fn(next_input, axis=0)
    if accumulator is None:
      return batch
    else:
      return self._fn((accumulator, batch), axis=0)

  @property
  def reduce_instance_dims(self):
    return self._reduce_instance_dims
  def merge_accumulators(self, accumulators):
    # numpy's sum, min, max, etc. functions operate on array-like objects,
    # but not on arbitrary iterables, so convert the provided accumulators
    # into a list.
    return self._fn(list(accumulators), axis=0)

  def extract_output(self, accumulator):
    return [accumulator]
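
# For intuition (illustrative, not part of the source): with
# reduce_instance_dims=True the function collapses each batch to a scalar,
# e.g. np.min([[1, 2], [3, 0]]) == 0, while with reduce_instance_dims=False it
# reduces only over the batch (0th) dimension, e.g.
# np.min([[1, 2], [3, 0]], axis=0) == [1, 0], preserving the instance shape.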


def _numeric_combine(x, combiner_type, reduce_instance_dims=True, name=None):
def _numeric_combine(x, fn, reduce_instance_dims=True, name=None):
  """Apply an analyzer with _NumericCombineSpec to given input."""
  if not isinstance(x, tf.Tensor):
    raise TypeError('Expected a Tensor, but got %r' % x)
@@ -143,10 +214,9 @@ def _numeric_combine(x, combiner_type, reduce_instance_dims=True, name=None):
    # If reducing over batch dimensions, with unknown shape, the result will
    # also have unknown shape.
    shape = None
  spec = _NumericCombineSpec(x.dtype, combiner_type, reduce_instance_dims)
  return Analyzer(
      [x], [(x.dtype, shape, False)], spec,
      name if name is not None else combiner_type).outputs[0]
  return combine_analyzer(
      x, x.dtype, shape, _NumPyCombinerSpec(fn, reduce_instance_dims),
      name if name is not None else fn.__name__)


def min(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -162,8 +232,7 @@ def min(x, reduce_instance_dims=True, name=None):  # pylint: disable=redefined-builtin
  Returns:
    A `Tensor`. Has the same type as `x`.
  """
  return _numeric_combine(
      x, _NumericCombineSpec.MIN, reduce_instance_dims, name)
  return _numeric_combine(x, np.min, reduce_instance_dims, name)


def max(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -179,8 +248,7 @@ def max(x, reduce_instance_dims=True, name=None):  # pylint: disable=redefined-builtin
  Returns:
    A `Tensor`. Has the same type as `x`.
  """
  return _numeric_combine(
      x, _NumericCombineSpec.MAX, reduce_instance_dims, name)
  return _numeric_combine(x, np.max, reduce_instance_dims, name)


def sum(x, reduce_instance_dims=True, name=None): # pylint: disable=redefined-builtin
@@ -196,8 +264,7 @@ def sum(x, reduce_instance_dims=True, name=None):  # pylint: disable=redefined-builtin
  Returns:
    A `Tensor`. Has the same type as `x`.
  """
  return _numeric_combine(
      x, _NumericCombineSpec.SUM, reduce_instance_dims, name)
  return _numeric_combine(x, np.sum, reduce_instance_dims, name)


def size(x, reduce_instance_dims=True, name=None):
@@ -271,18 +338,13 @@ def var(x, reduce_instance_dims=True, name=None):
class _UniquesSpec(object):
  """Operation to compute unique values."""

  def __init__(self, dtype, top_k, frequency_threshold,
  def __init__(self, top_k, frequency_threshold,
               vocab_filename, store_frequency):
    self._dtype = dtype
    self._top_k = top_k
    self._frequency_threshold = frequency_threshold
    self._vocab_filename = vocab_filename
    self._store_frequency = store_frequency

  @property
  def dtype(self):
    return self._dtype

  @property
  def top_k(self):
    return self._top_k
@@ -400,8 +462,8 @@ def uniques(x, top_k=None, frequency_threshold=None,
  # Make the file name path safe.
  vocab_filename = sanitized_vocab_filename(vocab_filename, prefix=prefix)

  spec = _UniquesSpec(tf.string, top_k, frequency_threshold,
                      vocab_filename, store_frequency)
  spec = _UniquesSpec(top_k, frequency_threshold, vocab_filename,
                      store_frequency)
  return Analyzer([x], [(tf.string, [], True)], spec, 'uniques').outputs[0]
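
# Usage sketch (illustrative): uniques() writes the vocabulary to disk and
# returns the vocabulary filename as a `tf.string` Tensor, e.g.:
#
#   vocab_filename_tensor = uniques(inputs['s'], top_k=1000)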


@@ -469,50 +531,3 @@ def quantiles(x, num_buckets, epsilon, name=None):
  # Drop the first and last quantile boundaries, so that we end up with
  # num_buckets-1 boundaries, and hence num_buckets buckets.
  return quantile_boundaries[0:1, 1:-1]
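
# Usage sketch (illustrative): quantiles() is typically consumed through the
# new tft.bucketize mapper, which maps each value to the index of its
# quantile bucket, e.g.:
#
#   def preprocessing_fn(inputs):
#     return {'x_bucketized': tft.bucketize(inputs['x'], num_buckets=10)}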


class _CombinerSpec(object):
  """Analyze using combiner function.

  Args:
    combiner: Object of a class that implements the beam.CombineFn()
      interface.  In addition, the combiner class must implement a @property
      method called output_dtype() that returns the tf.DType of the output of
      the combiner.
  """

  def __init__(self, combiner):
    self._combiner = combiner

  @property
  def combiner(self):
    return self._combiner

  @property
  def output_dtype(self):
    return self._combiner.output_dtype


def combine_analyzer(x, combiner, name=None):
  """Applies the combiner over the whole dataset.

  Args:
    x: An input `Tensor` or `SparseTensor`.
    combiner: Object of a class that implements the beam.CombineFn()
      interface.  In addition, the combiner class must implement a @property
      method called output_dtype() that returns the type of the output of the
      combiner.
    name: (Optional) A name for this operation.

  Returns:
    The combined values as a list, where each element in the list is of type
    combiner.output_dtype().
  """

  # The TF node name will be of the form:
  # original_scope/{combine_analyzer|name}/{class-name-of-combiner}
  with tf.name_scope(name, 'combine_analyzer'):
    spec = _CombinerSpec(combiner)
    return Analyzer(
        [x],
        [(spec.output_dtype, [1, None], False)],
        spec,
        type(combiner).__name__).outputs[0]