Project import generated by Copybara.
PiperOrigin-RevId: 190121533
tf-transform-team authored and zoyahav committed Mar 22, 2018
1 parent 4b45bfe commit 911e38a
Showing 10 changed files with 143 additions and 82 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -44,7 +44,8 @@ our testing framework. Other combinations may also work, but are untested.

|tensorflow-transform |tensorflow |apache-beam[gcp]|
|--------------------------------------------------------------------------------|--------------|----------------|
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.3.0 |
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.4.0 |
|[0.6.0](https://github.com/tensorflow/transform/blob/v0.6.0/RELEASE.md) |1.6 |2.4.0 |
|[0.5.0](https://github.com/tensorflow/transform/blob/v0.5.0/RELEASE.md) |1.5 |2.3.0 |
|[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md) |1.4 |2.2.0 |
|[0.3.1](https://github.com/tensorflow/transform/blob/v0.3.1/RELEASE.md) |1.3 |2.1.1 |
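A minimal sketch (not part of this diff) for checking that locally installed packages match one row of the table above; the package names are the ones published on PyPI:

import pkg_resources

# Print the installed versions of the three packages from the table; compare
# them against a single row (e.g. tensorflow-transform 0.6.0, tensorflow 1.6,
# apache-beam[gcp] 2.4.0).
for pkg in ('tensorflow-transform', 'tensorflow', 'apache-beam'):
    print(pkg, pkg_resources.get_distribution(pkg).version)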
5 changes: 4 additions & 1 deletion RELEASE.md
@@ -1,17 +1,20 @@
# Current version (not yet released; still in development)
# Release 0.6.0

## Major Features and Improvements

## Bug Fixes and Other Changes
* Depends on `apache-beam[gcp]>=2.4,<3`.
* Trim min/max value in `tft.bucketize` where the computed number of bucket
boundaries is more than requested. Updated documentation to clearly indicate
that the number of buckets is computed using approximate algorithms, and that
the computed number can be more or less than requested (see the sketch after
this list).
* Change the namespace used for Beam metrics from `tensorflow_transform` to
`tfx.Transform`.
* Update Beam metrics to also log vocabulary sizes.
* `CsvCoder` updated to support unicode.
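A minimal usage sketch (not part of these release notes; the feature name 'price' is illustrative) showing where the bucketize behavior described above applies:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # Request 10 buckets; because the boundaries come from an approximate
  # quantiles algorithm, the realized number of buckets may be slightly more
  # or less than 10.
  return {
      'price_bucketized': tft.bucketize(inputs['price'], num_buckets=10),
  }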

## Breaking changes
* Requires pre-installed TensorFlow >=1.6,<2.

## Deprecations

14 changes: 7 additions & 7 deletions setup.py
@@ -17,19 +17,19 @@
from setuptools import setup

# Tensorflow transform version.
__version__ = '0.6.0dev'
__version__ = '0.6.0'


def _make_required_install_packages():
return [
'apache-beam[gcp]>=2.3,<3',
'apache-beam[gcp]>=2.4,<3',
'numpy>=1.10,<2',

# Protobuf libraries < 3.3 contain some map-related data corruption bugs
# (b/35874111).
'protobuf>=3.3,<4',
# Protobuf libraries < 3.5.2 do not have 'cpp' implementation of protobufs
# for Windows and Mac.
'protobuf>=3.5.2,<4',

# Six 1.11.0 incompatible with apitools.
'six>=1.9,<1.11',
'six>=1.9,<2',

]

61 changes: 36 additions & 25 deletions tensorflow_transform/analyzers.py
@@ -41,7 +41,7 @@

# Named tuple with details for each output of an Analyzer.
_AnalyzerOutputInfo = collections.namedtuple(
'AnalyzerOutputInfo', ['name', 'dtype', 'is_asset'])
'AnalyzerOutputInfo', ['name', 'is_asset'])


# NOTE: this code is designed so that Analyzer is pickleable, and in particular
@@ -52,6 +52,10 @@
# of a PTransform in our implementation of tf.Transform on Beam currently, so
# we must avoid directly putting `Tensor`s inside `Analyzer`, and instead use
# tensor names.
#
# Due to these pickling issues and also logical separation of TensorFlow and
# numpy code, the spec should also not contain TensorFlow dtypes but rather
# their numpy equivalent.
class Analyzer(object):
"""An operation-like class for full-pass analyses of data.
Expand Down Expand Up @@ -91,7 +95,7 @@ def __init__(self, inputs, output_dtype_shape_and_is_asset, spec, name):
raise ValueError(('Tensor {} cannot represent an asset, because it '
'is not a string.').format(output_tensor.name))
self._output_infos.append(_AnalyzerOutputInfo(
output_tensor.name, output_tensor.dtype, is_asset))
output_tensor.name, is_asset))
self._spec = spec
tf.add_to_collection(ANALYZER_COLLECTION, self)
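A short, hedged illustration of the comment added above about keeping TensorFlow dtypes out of the picklable spec; the conversion relies on the standard `as_numpy_dtype` property:

import numpy as np
import tensorflow as tf

# Convert a TensorFlow dtype to its numpy equivalent before storing it in a
# spec that must be pickled by the Beam runner.
tf_dtype = tf.float32
np_dtype = tf_dtype.as_numpy_dtype
assert np.dtype(np_dtype) == np.float32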

@@ -201,11 +205,18 @@ def combine_analyzer(inputs, output_dtypes, output_shapes, combiner_spec, name):


class _NumPyCombinerSpec(CombinerSpec):
"""Combines the PCollection only on the 0th dimension using nparray."""
"""Combines the PCollection only on the 0th dimension using nparray.
Args:
fn: The numpy function representing the reduction to be done.
reduce_instance_dims: Whether to reduce across non-batch dimensions.
output_dtypes: The numpy dtype to cast each output to.
"""

def __init__(self, fn, reduce_instance_dims):
def __init__(self, fn, reduce_instance_dims, output_dtypes):
self._fn = fn
self._reduce_instance_dims = reduce_instance_dims
self._output_dtypes = output_dtypes

def create_accumulator(self):
return None
Expand All @@ -232,7 +243,13 @@ def merge_accumulators(self, accumulators):
for sub_accumulators in zip(*accumulators)]

def extract_output(self, accumulator):
return accumulator
if accumulator is None:
return None
# For each output, cast that output to the specified type. Note there will
# be one output for each input tensor to the analyzer.
return [sub_accumulator.astype(output_dtype)
for sub_accumulator, output_dtype
in zip(accumulator, self._output_dtypes)]
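A standalone numpy sketch (illustrative values only) of the casting that the new extract_output performs, one cast per analyzer output:

import numpy as np

accumulator = [np.array([1.5, 2.5]), np.array([7, 9])]
output_dtypes = [np.float32, np.int64]

# Cast each per-tensor sub-accumulator to the dtype recorded for that output.
outputs = [sub_accumulator.astype(output_dtype)
           for sub_accumulator, output_dtype
           in zip(accumulator, output_dtypes)]
# outputs[0].dtype == np.float32 and outputs[1].dtype == np.int64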


def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
@@ -266,11 +283,10 @@ def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
# shape.
shapes = [x.shape.as_list()[1:] if x.shape.dims is not None else None
for x in inputs]
spec = _NumPyCombinerSpec(fn, reduce_instance_dims,
[x.dtype.as_numpy_dtype for x in inputs])
return combine_analyzer(
inputs,
[x.dtype for x in inputs],
shapes,
_NumPyCombinerSpec(fn, reduce_instance_dims),
inputs, [x.dtype for x in inputs], shapes, spec,
name if name is not None else fn.__name__)


@@ -615,22 +631,17 @@ def quantiles(x, num_buckets, epsilon, name=None):

with tf.name_scope(name, 'quantiles'):
spec = _QuantilesSpec(epsilon, num_buckets)
quantile_boundaries = Analyzer(
return Analyzer(
[x], [(spec.bucket_dtype, [1, None], False)], spec,
'quantiles').outputs[0]

# The Analyzer returns a 2d matrix of 1*num_buckets. Below, we remove
# the first dimension and return the boundaries as a simple 1d list.
return quantile_boundaries[0:1]
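A hedged construction sketch (placeholder input, not taken from this diff) of the new behavior, where the analyzer's rank-2 output of shape (1, num_boundaries) is returned directly rather than sliced:

import tensorflow as tf
from tensorflow_transform import analyzers

# Hypothetical input column; in a real preprocessing_fn this would come from
# the `inputs` dict.
x = tf.placeholder(tf.float32, shape=[None])

# Returns the analyzer output directly: a 2-d tensor of shape (1, None).
boundaries = analyzers.quantiles(x, num_buckets=4, epsilon=0.01)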


class _CovarianceCombinerSpec(CombinerSpec):
"""Combines the PCollection to compute the biased covariance matrix."""

def __init__(self, dtype=tf.float64):
def __init__(self, numpy_dtype=np.float64):
"""Store the dtype for np arrays/matrices for precision."""
self._output_dtype = dtype
self._np_dtype = dtype.as_numpy_dtype
self._numpy_dtype = numpy_dtype

def create_accumulator(self):
"""Create an accumulator with all zero entries."""
@@ -663,9 +674,9 @@ def add_input(self, accumulator, batch_values):
batch_cross_terms = np.matmul(
np.transpose(batch_value),
batch_value
).astype(self._np_dtype)
).astype(self._numpy_dtype)

batch_sum = np.array(np.sum(batch_value, axis=0), self._np_dtype)
batch_sum = np.array(np.sum(batch_value, axis=0), self._numpy_dtype)
batch_count = np.shape(batch_value)[0]

if accumulator is None:
@@ -725,7 +736,7 @@ def covariance(x, dtype, name=None):
Args:
x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in each input
vector.
dtype: numpy dtype of entries in the returned matrix.
dtype: Tensorflow dtype of entries in the returned matrix.
name: (Optional) A name for this operation.
Raises:
@@ -743,17 +754,17 @@ def covariance(x, dtype, name=None):
input_dim = x.shape.as_list()[1]
shape = (input_dim, input_dim)

spec = _CovarianceCombinerSpec(dtype)
spec = _CovarianceCombinerSpec(dtype.as_numpy_dtype)
return combine_analyzer(
[x], [dtype], [shape], spec,
name if name is not None else 'covariance')[0]
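For intuition, a hedged numpy sketch (synthetic data) of the biased covariance the combiner accumulates towards, built from the same running quantities shown in add_input above (cross terms, column sums, row count):

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float64)

cross_terms = np.matmul(x.T, x)   # sum over rows of outer products
col_sums = np.sum(x, axis=0)      # per-column sums
count = x.shape[0]                # number of rows seen

mean = col_sums / count
biased_cov = cross_terms / count - np.outer(mean, mean)
# Agrees with np.cov(x, rowvar=False, bias=True) up to floating-point error.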


class _PCACombinerSpec(_CovarianceCombinerSpec):

def __init__(self, output_dim=None, dtype=tf.float64):
def __init__(self, output_dim=None, numpy_dtype=np.float64):
"""Store pca output dimension, and dtype for precision."""
super(_PCACombinerSpec, self).__init__(dtype=dtype)
super(_PCACombinerSpec, self).__init__(numpy_dtype=numpy_dtype)
self._output_dim = output_dim

def extract_output(self, accumulator):
@@ -844,7 +855,7 @@ def pca(x, output_dim, dtype, name=None):
Args:
x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in row vectors.
output_dim: The PCA output dimension (number of eigenvectors to return).
dtype: numpy dtype of entries in the returned matrix.
dtype: Tensorflow dtype of entries in the returned matrix.
name: (Optional) A name for this operation.
Raises:
@@ -862,7 +873,7 @@ def pca(x, output_dim, dtype, name=None):
input_dim = x.shape.as_list()[1]
shape = (input_dim, output_dim)

spec = _PCACombinerSpec(output_dim, dtype)
spec = _PCACombinerSpec(output_dim, dtype.as_numpy_dtype)
return combine_analyzer(
[x], [dtype], [shape], spec,
name if name is not None else 'pca')[0]
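A hedged numpy sketch (synthetic data; ordering and sign conventions may differ from _PCACombinerSpec's actual extract_output) of the PCA step as an eigen-decomposition of that covariance:

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 7.0]], dtype=np.float64)
output_dim = 1

cov = np.cov(x, rowvar=False, bias=True)
eigenvalues, eigenvectors = np.linalg.eigh(cov)      # ascending eigenvalues
# Keep the eigenvectors with the largest eigenvalues as the output columns.
components = eigenvectors[:, ::-1][:, :output_dim]   # shape (input_dim, output_dim)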
30 changes: 17 additions & 13 deletions tensorflow_transform/beam/analyzer_impls.py
@@ -23,7 +23,6 @@

import apache_beam as beam

from apache_beam.typehints import Any
from apache_beam.typehints import KV
from apache_beam.typehints import List
from apache_beam.typehints import with_input_types
@@ -64,16 +63,14 @@ def _maybe_deserialize_tf_config(serialized_tf_config):


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _AnalyzerImpl(beam.PTransform):
"""PTransform that implements a given analyzer.
_AnalyzerImpl accepts a PCollection where each element is a list of ndarrays.
Each element in this list contains a batch of values for the corresponding
input tensor of the analyzer. _AnalyzerImpl returns a PCollection containing a
single element which is a list of values. Each element should be convertible
to an ndarray via np.asarray, and the converted value will be the
corresponding output tensor of the analyzer.
single element which is a list of `ndarray`s.
_AnalyzerImpl dispatches to an implementation transform, with the same
signature as _AnalyzerImpl.
@@ -106,7 +103,7 @@ def _flatten_value_to_list(batch_values):


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _UniquesAnalyzerImpl(beam.PTransform):
"""Saves the unique elements in a PCollection of batches."""

@@ -196,15 +193,15 @@ def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
# Return the vocabulary path.
wait_for_vocabulary_transform = (
pcoll.pipeline
| 'CreatePath' >> beam.Create([[vocabulary_file]])
| 'CreatePath' >> beam.Create([[np.array(vocabulary_file)]])
# Ensure that the analysis returns only after the file is written.
| 'WaitForVocabularyFile' >> beam.Map(
lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
return wait_for_vocabulary_transform


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _ComputeQuantiles(beam.CombineFn):
"""Computes quantiles on the PCollection.
@@ -213,9 +210,11 @@ class _ComputeQuantiles(beam.CombineFn):
see also http://web.cs.ucla.edu/~weiwang/paper/SSDBM07_2.pdf
"""

def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
def __init__(self, num_quantiles, epsilon, bucket_dtype,
serialized_tf_config=None):
self._num_quantiles = num_quantiles
self._epsilon = epsilon
self._bucket_dtype = bucket_dtype
self._serialized_tf_config = serialized_tf_config

# _stamp_token is used to commit the state of the qaccumulator. In
@@ -297,7 +296,8 @@ def merge_accumulators(self, summaries):

def extract_output(self, summary):
if summary is self._empty_summary:
return [[[]]]
# Return an empty (1, 0) ndarray using np.zeros.
return [np.zeros(shape=(1, 0), dtype=self._bucket_dtype)]

# All relevant state about the input is captured by 'summary'
# (see comment in add_input() and merge_accumulators()).
@@ -330,11 +330,14 @@ def extract_output(self, summary):
# Do not trim min/max, these are part of requested boundaries.
pass

return [[buckets]]
# Convert to a (1, ?) shape array.
buckets = np.expand_dims(buckets, 0)

return [buckets]
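A small numpy sketch (illustrative boundary values) of the output shaping above, including the empty-input case:

import numpy as np

# Normal case: boundaries come out of the accumulator as a 1-d array and are
# reshaped to (1, num_boundaries).
buckets = np.array([0.25, 0.5, 0.75], dtype=np.float32)
buckets = np.expand_dims(buckets, 0)   # shape (1, 3)

# Empty-input case: an empty (1, 0) array of the bucket dtype is returned.
empty = np.zeros(shape=(1, 0), dtype=np.float32)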


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _QuantilesAnalyzerImpl(beam.PTransform):
"""Computes the quantile buckets in a PCollection of batches."""

@@ -350,11 +353,12 @@ def expand(self, pcoll):
_ComputeQuantiles(
num_quantiles=self._spec.num_buckets,
epsilon=self._spec.epsilon,
bucket_dtype=self._spec.bucket_dtype.as_numpy_dtype,
serialized_tf_config=serialized_tf_config)))


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _CombineFnWrapper(beam.CombineFn):
"""Class to wrap a analyzers._CombinerSpec as a beam.CombineFn."""
