Project import generated by Copybara.
PiperOrigin-RevId: 190121533
tf-transform-team authored and zoyahav committed Mar 22, 2018
1 parent 4b45bfe commit 911e38a
Showing 10 changed files with 143 additions and 82 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -44,7 +44,8 @@ our testing framework. Other combinations may also work, but are untested.

|tensorflow-transform |tensorflow |apache-beam[gcp]|
|--------------------------------------------------------------------------------|--------------|----------------|
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.3.0 |
|[GitHub master](https://github.com/tensorflow/transform/blob/master/RELEASE.md) |nightly (1.x) |2.4.0 |
|[0.6.0](https://github.com/tensorflow/transform/blob/v0.6.0/RELEASE.md) |1.6 |2.4.0 |
|[0.5.0](https://github.com/tensorflow/transform/blob/v0.5.0/RELEASE.md) |1.5 |2.3.0 |
|[0.4.0](https://github.com/tensorflow/transform/blob/v0.4.0/RELEASE.md) |1.4 |2.2.0 |
|[0.3.1](https://github.com/tensorflow/transform/blob/v0.3.1/RELEASE.md) |1.3 |2.1.1 |
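A minimal sketch (not part of this diff) for checking that locally installed packages match one row of the table above; the package names are the ones published on PyPI:

import pkg_resources

# Print the installed versions of the three packages from the table; compare
# them against a single row (e.g. tensorflow-transform 0.6.0, tensorflow 1.6,
# apache-beam[gcp] 2.4.0).
for pkg in ('tensorflow-transform', 'tensorflow', 'apache-beam'):
    print(pkg, pkg_resources.get_distribution(pkg).version)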
5 changes: 4 additions & 1 deletion RELEASE.md
@@ -1,17 +1,20 @@
# Current version (not yet released; still in development)
# Release 0.6.0

## Major Features and Improvements

## Bug Fixes and Other Changes
* Depends on `apache-beam[gcp]>=2.4,<3`.
* Trim min/max value in `tft.bucketize` where the computed number of bucket
boundaries is more than requested. Updated documentation to clearly indicate
that the number of buckets is computed using approximate algorithms, and that
the computed number can be more or less than requested (see the sketch after
this list).
* Change the namespace used for Beam metrics from `tensorflow_transform` to
`tfx.Transform`.
* Update Beam metrics to also log vocabulary sizes.
* `CsvCoder` updated to support unicode.
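A minimal usage sketch (not part of these release notes; the feature name 'price' is illustrative) showing where the bucketize behavior described above applies:

import tensorflow_transform as tft

def preprocessing_fn(inputs):
  # Request 10 buckets; because the boundaries come from an approximate
  # quantiles algorithm, the realized number of buckets may be slightly more
  # or less than 10.
  return {
      'price_bucketized': tft.bucketize(inputs['price'], num_buckets=10),
  }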

## Breaking changes
* Requires pre-installed TensorFlow >=1.6,<2.

## Deprecations

14 changes: 7 additions & 7 deletions setup.py
@@ -17,19 +17,19 @@
from setuptools import setup

# Tensorflow transform version.
__version__ = '0.6.0dev'
__version__ = '0.6.0'


def _make_required_install_packages():
return [
'apache-beam[gcp]>=2.3,<3',
'apache-beam[gcp]>=2.4,<3',
'numpy>=1.10,<2',

# Protobuf libraries < 3.3 contain some map-related data corruption bugs
# (b/35874111).
'protobuf>=3.3,<4',
# Protobuf libraries < 3.5.2 do not have 'cpp' implementation of protobufs
# for Windows and Mac.
'protobuf>=3.5.2,<4',

# Six 1.11.0 incompatible with apitools.
'six>=1.9,<1.11',
'six>=1.9,<2',

]

61 changes: 36 additions & 25 deletions tensorflow_transform/analyzers.py
@@ -41,7 +41,7 @@

# Named tuple with details for each output of an Analyzer.
_AnalyzerOutputInfo = collections.namedtuple(
'AnalyzerOutputInfo', ['name', 'dtype', 'is_asset'])
'AnalyzerOutputInfo', ['name', 'is_asset'])


# NOTE: this code is designed so that Analyzer is pickleable, and in particular
@@ -52,6 +52,10 @@
# of a PTransform in our implementation of tf.Transform on Beam currently, so
# we must avoid directly putting `Tensor`s inside `Analyzer`, and instead use
# tensor names.
#
# Due to these pickling issues and also logical separation of TensorFlow and
# numpy code, the spec should also not contain TensorFlow dtypes but rather
# their numpy equivalent.
class Analyzer(object):
"""An operation-like class for full-pass analyses of data.
Expand Down Expand Up @@ -91,7 +95,7 @@ def __init__(self, inputs, output_dtype_shape_and_is_asset, spec, name):
raise ValueError(('Tensor {} cannot represent an asset, because it '
'is not a string.').format(output_tensor.name))
self._output_infos.append(_AnalyzerOutputInfo(
output_tensor.name, output_tensor.dtype, is_asset))
output_tensor.name, is_asset))
self._spec = spec
tf.add_to_collection(ANALYZER_COLLECTION, self)
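A short, hedged illustration of the comment added above about keeping TensorFlow dtypes out of the picklable spec; the conversion relies on the standard `as_numpy_dtype` property:

import numpy as np
import tensorflow as tf

# Convert a TensorFlow dtype to its numpy equivalent before storing it in a
# spec that must be pickled by the Beam runner.
tf_dtype = tf.float32
np_dtype = tf_dtype.as_numpy_dtype
assert np.dtype(np_dtype) == np.float32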

@@ -201,11 +205,18 @@ def combine_analyzer(inputs, output_dtypes, output_shapes, combiner_spec, name):


class _NumPyCombinerSpec(CombinerSpec):
"""Combines the PCollection only on the 0th dimension using nparray."""
"""Combines the PCollection only on the 0th dimension using nparray.
Args:
fn: The numpy function representing the reduction to be done.
reduce_instance_dims: Whether to reduce across non-batch dimensions.
output_dtypes: The numpy dtype to cast each output to.
"""

def __init__(self, fn, reduce_instance_dims):
def __init__(self, fn, reduce_instance_dims, output_dtypes):
self._fn = fn
self._reduce_instance_dims = reduce_instance_dims
self._output_dtypes = output_dtypes

def create_accumulator(self):
return None
Expand All @@ -232,7 +243,13 @@ def merge_accumulators(self, accumulators):
for sub_accumulators in zip(*accumulators)]

def extract_output(self, accumulator):
return accumulator
if accumulator is None:
return None
# For each output, cast that output to the specified type. Note there will
# be one output for each input tensor to the analyzer.
return [sub_accumulator.astype(output_dtype)
for sub_accumulator, output_dtype
in zip(accumulator, self._output_dtypes)]
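A standalone numpy sketch (illustrative values only) of the casting that the new extract_output performs, one cast per analyzer output:

import numpy as np

accumulator = [np.array([1.5, 2.5]), np.array([7, 9])]
output_dtypes = [np.float32, np.int64]

# Cast each per-tensor sub-accumulator to the dtype recorded for that output.
outputs = [sub_accumulator.astype(output_dtype)
           for sub_accumulator, output_dtype
           in zip(accumulator, output_dtypes)]
# outputs[0].dtype == np.float32 and outputs[1].dtype == np.int64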


def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
@@ -266,11 +283,10 @@ def _numeric_combine(inputs, fn, reduce_instance_dims=True, name=None):
# shape.
shapes = [x.shape.as_list()[1:] if x.shape.dims is not None else None
for x in inputs]
spec = _NumPyCombinerSpec(fn, reduce_instance_dims,
[x.dtype.as_numpy_dtype for x in inputs])
return combine_analyzer(
inputs,
[x.dtype for x in inputs],
shapes,
_NumPyCombinerSpec(fn, reduce_instance_dims),
inputs, [x.dtype for x in inputs], shapes, spec,
name if name is not None else fn.__name__)


@@ -615,22 +631,17 @@ def quantiles(x, num_buckets, epsilon, name=None):

with tf.name_scope(name, 'quantiles'):
spec = _QuantilesSpec(epsilon, num_buckets)
quantile_boundaries = Analyzer(
return Analyzer(
[x], [(spec.bucket_dtype, [1, None], False)], spec,
'quantiles').outputs[0]

# The Analyzer returns a 2d matrix of 1*num_buckets. Below, we remove
# the first dimension and return the boundaries as a simple 1d list.
return quantile_boundaries[0:1]
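A hedged construction sketch (placeholder input, not taken from this diff) of the new behavior, where the analyzer's rank-2 output of shape (1, num_boundaries) is returned directly rather than sliced:

import tensorflow as tf
from tensorflow_transform import analyzers

# Hypothetical input column; in a real preprocessing_fn this would come from
# the `inputs` dict.
x = tf.placeholder(tf.float32, shape=[None])

# Returns the analyzer output directly: a 2-d tensor of shape (1, None).
boundaries = analyzers.quantiles(x, num_buckets=4, epsilon=0.01)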


class _CovarianceCombinerSpec(CombinerSpec):
"""Combines the PCollection to compute the biased covariance matrix."""

def __init__(self, dtype=tf.float64):
def __init__(self, numpy_dtype=np.float64):
"""Store the dtype for np arrays/matrices for precision."""
self._output_dtype = dtype
self._np_dtype = dtype.as_numpy_dtype
self._numpy_dtype = numpy_dtype

def create_accumulator(self):
"""Create an accumulator with all zero entries."""
@@ -663,9 +674,9 @@ def add_input(self, accumulator, batch_values):
batch_cross_terms = np.matmul(
np.transpose(batch_value),
batch_value
).astype(self._np_dtype)
).astype(self._numpy_dtype)

batch_sum = np.array(np.sum(batch_value, axis=0), self._np_dtype)
batch_sum = np.array(np.sum(batch_value, axis=0), self._numpy_dtype)
batch_count = np.shape(batch_value)[0]

if accumulator is None:
@@ -725,7 +736,7 @@ def covariance(x, dtype, name=None):
Args:
x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in each input
vector.
dtype: numpy dtype of entries in the returned matrix.
dtype: Tensorflow dtype of entries in the returned matrix.
name: (Optional) A name for this operation.
Raises:
@@ -743,17 +754,17 @@ def covariance(x, dtype, name=None):
input_dim = x.shape.as_list()[1]
shape = (input_dim, input_dim)

spec = _CovarianceCombinerSpec(dtype)
spec = _CovarianceCombinerSpec(dtype.as_numpy_dtype)
return combine_analyzer(
[x], [dtype], [shape], spec,
name if name is not None else 'covariance')[0]
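For intuition, a hedged numpy sketch (synthetic data) of the biased covariance the combiner accumulates towards, built from the same running quantities shown in add_input above (cross terms, column sums, row count):

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float64)

cross_terms = np.matmul(x.T, x)   # sum over rows of outer products
col_sums = np.sum(x, axis=0)      # per-column sums
count = x.shape[0]                # number of rows seen

mean = col_sums / count
biased_cov = cross_terms / count - np.outer(mean, mean)
# Agrees with np.cov(x, rowvar=False, bias=True) up to floating-point error.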


class _PCACombinerSpec(_CovarianceCombinerSpec):

def __init__(self, output_dim=None, dtype=tf.float64):
def __init__(self, output_dim=None, numpy_dtype=np.float64):
"""Store pca output dimension, and dtype for precision."""
super(_PCACombinerSpec, self).__init__(dtype=dtype)
super(_PCACombinerSpec, self).__init__(numpy_dtype=numpy_dtype)
self._output_dim = output_dim

def extract_output(self, accumulator):
@@ -844,7 +855,7 @@ def pca(x, output_dim, dtype, name=None):
Args:
x: A rank-2 `Tensor`, 0th dim are rows, 1st dim are indices in row vectors.
output_dim: The PCA output dimension (number of eigenvectors to return).
dtype: numpy dtype of entries in the returned matrix.
dtype: Tensorflow dtype of entries in the returned matrix.
name: (Optional) A name for this operation.
Raises:
@@ -862,7 +873,7 @@ def pca(x, output_dim, dtype, name=None):
input_dim = x.shape.as_list()[1]
shape = (input_dim, output_dim)

spec = _PCACombinerSpec(output_dim, dtype)
spec = _PCACombinerSpec(output_dim, dtype.as_numpy_dtype)
return combine_analyzer(
[x], [dtype], [shape], spec,
name if name is not None else 'pca')[0]
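A hedged numpy sketch (synthetic data; ordering and sign conventions may differ from _PCACombinerSpec's actual extract_output) of the PCA step as an eigen-decomposition of that covariance:

import numpy as np

x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 7.0]], dtype=np.float64)
output_dim = 1

cov = np.cov(x, rowvar=False, bias=True)
eigenvalues, eigenvectors = np.linalg.eigh(cov)      # ascending eigenvalues
# Keep the eigenvectors with the largest eigenvalues as the output columns.
components = eigenvectors[:, ::-1][:, :output_dim]   # shape (input_dim, output_dim)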
30 changes: 17 additions & 13 deletions tensorflow_transform/beam/analyzer_impls.py
@@ -23,7 +23,6 @@

import apache_beam as beam

from apache_beam.typehints import Any
from apache_beam.typehints import KV
from apache_beam.typehints import List
from apache_beam.typehints import with_input_types
@@ -64,16 +63,14 @@ def _maybe_deserialize_tf_config(serialized_tf_config):


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _AnalyzerImpl(beam.PTransform):
"""PTransform that implements a given analyzer.
_AnalyzerImpl accepts a PCollection where each element is a list of ndarrays.
Each element in this list contains a batch of values for the corresponding
input tensor of the analyzer. _AnalyzerImpl returns a PCollection containing a
single element which is a list of values. Each element should be convertible
to an ndarray via np.asarray, and the converted value will be the
corresponding output tensor of the analyzer.
single element which is a list of `ndarray`s.
_AnalyzerImpl dispatches to an implementation transform, with the same
signature as _AnalyzerImpl.
@@ -106,7 +103,7 @@ def _flatten_value_to_list(batch_values):


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _UniquesAnalyzerImpl(beam.PTransform):
"""Saves the unique elements in a PCollection of batches."""

@@ -196,15 +193,15 @@ def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
# Return the vocabulary path.
wait_for_vocabulary_transform = (
pcoll.pipeline
| 'CreatePath' >> beam.Create([[vocabulary_file]])
| 'CreatePath' >> beam.Create([[np.array(vocabulary_file)]])
# Ensure that the analysis returns only after the file is written.
| 'WaitForVocabularyFile' >> beam.Map(
lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
return wait_for_vocabulary_transform


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _ComputeQuantiles(beam.CombineFn):
"""Computes quantiles on the PCollection.
@@ -213,9 +210,11 @@ class _ComputeQuantiles(beam.CombineFn):
see also http://web.cs.ucla.edu/~weiwang/paper/SSDBM07_2.pdf
"""

def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
def __init__(self, num_quantiles, epsilon, bucket_dtype,
serialized_tf_config=None):
self._num_quantiles = num_quantiles
self._epsilon = epsilon
self._bucket_dtype = bucket_dtype
self._serialized_tf_config = serialized_tf_config

# _stamp_token is used to commit the state of the qaccumulator. In
@@ -297,7 +296,8 @@ def merge_accumulators(self, summaries):

def extract_output(self, summary):
if summary is self._empty_summary:
return [[[]]]
# Return an empty (1, 0) ndarray using np.zeros.
return [np.zeros(shape=(1, 0), dtype=self._bucket_dtype)]

# All relevant state about the input is captured by 'summary'
# (see comment in add_input() and merge_accumulators()).
@@ -330,11 +330,14 @@ def extract_output(self, summary):
# Do not trim min/max, these are part of requested boundaries.
pass

return [[buckets]]
# Convert to a (1, ?) shape array.
buckets = np.expand_dims(buckets, 0)

return [buckets]
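A small numpy sketch (illustrative boundary values) of the output shaping above, including the empty-input case:

import numpy as np

# Normal case: boundaries come out of the accumulator as a 1-d array and are
# reshaped to (1, num_boundaries).
buckets = np.array([0.25, 0.5, 0.75], dtype=np.float32)
buckets = np.expand_dims(buckets, 0)   # shape (1, 3)

# Empty-input case: an empty (1, 0) array of the bucket dtype is returned.
empty = np.zeros(shape=(1, 0), dtype=np.float32)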


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _QuantilesAnalyzerImpl(beam.PTransform):
"""Computes the quantile buckets in a PCollection of batches."""

@@ -350,11 +353,12 @@ def expand(self, pcoll):
_ComputeQuantiles(
num_quantiles=self._spec.num_buckets,
epsilon=self._spec.epsilon,
bucket_dtype=self._spec.bucket_dtype.as_numpy_dtype,
serialized_tf_config=serialized_tf_config)))


@with_input_types(List[np.ndarray])
@with_output_types(List[Any])
@with_output_types(List[np.ndarray])
class _CombineFnWrapper(beam.CombineFn):
"""Class to wrap a analyzers._CombinerSpec as a beam.CombineFn."""
