From 473009505c5840b5fb99f9475ea3d3b2cb10ed17 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 26 Dec 2016 12:53:03 +0100 Subject: [PATCH 01/29] ENH: add numpy-style ufuncs and reductions --- pygpu/__init__.py | 2 +- pygpu/ufuncs.py | 895 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 896 insertions(+), 1 deletion(-) create mode 100644 pygpu/ufuncs.py diff --git a/pygpu/__init__.py b/pygpu/__init__.py index 566cd8d19b..44c0766782 100644 --- a/pygpu/__init__.py +++ b/pygpu/__init__.py @@ -4,7 +4,7 @@ def get_include(): assert os.path.exists(os.path.join(p, 'gpuarray_api.h')) return p -from . import gpuarray, elemwise, reduction +from . import gpuarray, elemwise, reduction, ufuncs from .gpuarray import (init, set_default_context, get_default_context, array, zeros, empty, asarray, ascontiguousarray, asfortranarray, register_dtype) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py new file mode 100644 index 0000000000..492d18e3b5 --- /dev/null +++ b/pygpu/ufuncs.py @@ -0,0 +1,895 @@ +"""Ufuncs and reductions for GPU arrays.""" + +from itertools import chain +import numpy +import re +from ._array import ndgpuarray +from .dtypes import dtype_to_ctype +from ._elemwise import arg +from .elemwise import as_argument, GpuElemwise +from .reduction import reduce1 +from .gpuarray import GpuArray, array, empty, get_default_context + + +# --- Helper functions --- # + +def restore_reduced_dims(shape, red_axes): + """Return tuple from ``shape`` with size-1 axes at indices ``red_axes``.""" + newshape = list(shape) + try: + for ax in sorted(red_axes): + newshape.insert(ax, 1) + except TypeError: + newshape.insert(red_axes, 1) + return tuple(newshape) + + +def reduce_dims(shape, red_axes): + """Return tuple from ``shape`` with ``red_axes`` removed.""" + newshape = list(shape) + try: + for ax in reversed(sorted(red_axes)): + newshape.pop(ax) + except TypeError: + newshape.pop(red_axes) + return tuple(newshape) + + +# --- Reductions with arithmetic operators --- # + + +def _prepare_array_for_reduction(a, out): + """Return input array ready for usage in a reduction kernel.""" + # Get a context and an array class to work with + need_context = True + for ary in (a, out): + if isinstance(ary, GpuArray): + ctx = ary.context + cls = ary.__class__ + need_context = False + break + if need_context: + ctx = get_default_context() + # TODO: sensible choice? Makes sense to choose the more "feature-rich" + # variant here perhaps. + cls = ndgpuarray + + # TODO: can CPU memory handed directly to kernels? + if not isinstance(a, GpuArray): + a = numpy.asarray(a) + if a.flags.f_contiguous and not a.flags.c_contiguous: + order = 'F' + else: + order = 'C' + a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + cls=cls) + + return a + + +def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, + keepdims=False): + """Reduce ``a`` using the operation ``op``. + + This is a wrapper around `pygpu.reduction.reduce1` with signature + closer to NumPy and parameter ``keepdims``. + + Parameters + ---------- + a : `pygpu.gpuarray.GpuArray` + Array that should be reduced. + op : str + Operation to be used for reduction. The reduction logic is:: + + result = a op b + + neutral : scalar + Neutral element of the operation fulfilling ``(n op a) == a`` + for all ``a``. It is used as initial state of the result. + + axis, dtype, out, keepdims : + Arguments as in NumPy reductions. See e.g. `numpy.sum`. 
+ + Returns + ------- + reduced : `pygpu.gpuarray.GpuArray` or scalar + If not all axes are reduced over or ``out`` was given, the result is + an array, a reference to ``out`` if provided. Otherwise, i.e. for + reductions along all axes without ``out`` parameter, the result + is a scalar. + """ + a = _prepare_array_for_reduction(a, out) + if a.dtype == numpy.dtype('float16') or dtype == numpy.dtype('float16'): + # Gives wrong results currently, see + # https://github.com/Theano/libgpuarray/issues/316 + raise NotImplementedError('float16 currently broken') + + if dtype is None: + if numpy.issubsctype(a.dtype, numpy.unsignedinteger): + # Avoid overflow for small integer types by default, as in + # Numpy + out_type = numpy.dtype('uint64') + elif numpy.issubsctype(a.dtype, numpy.integer): + out_type = numpy.dtype('int64') + else: + out_type = a.dtype + else: + out_type = dtype + + axes = axis if axis is not None else tuple(range(a.ndim)) + if out is not None: + out_arr = out.reshape(reduce_dims(a.shape, axes)) + else: + out_arr = out + + r = reduce1(a, op=op, neutral=neutral, out_type=out_type, axis=axis, + out=out_arr) + if keepdims: + newshape = restore_reduced_dims(r.shape, axes) + r = r.reshape(newshape) + if not r.shape and out is None: + return numpy.asarray(r).reshape([1])[0] + elif out is not None: + return out + else: + return r + + +def sum(a, axis=None, dtype=None, out=None, keepdims=False): + """Sum of array elements over a given axis. + + See Also + -------- + numpy.sum + """ + return reduce_with_op(a, '+', 0, axis, dtype, out, keepdims) + + +def prod(a, axis=None, dtype=None, out=None, keepdims=False): + """Return the product of array elements over a given axis. + + See Also + -------- + numpy.prod + """ + return reduce_with_op(a, '*', 1, axis, dtype, out, keepdims) + + +# --- Reductions with comparison operators --- # + +def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): + """Reduce ``a`` by comparison using ``cmp``. + + This is a wrapper around `pygpu.reduction.reduce1` with signature + closer to NumPy and parameter ``keepdims``. + + Parameters + ---------- + a : `pygpu.gpuarray.GpuArray` + Array that should be reduced. + cmp : str + Comparison operator to be used for reduction. The reduction + logic is:: + + result = (a cmp b) ? a : b + + neutral : scalar + Neutral element of the comparison fulfilling ``(n cmp a) == True`` + for all ``a``. It is used as initial state of the result. + + axis, out, keepdims : + Arguments as in NumPy reductions. See e.g. `numpy.amax`. + + Returns + ------- + reduced : `pygpu.gpuarray.GpuArray` or scalar + If not all axes are reduced over or ``out`` was given, the result is + an array, a reference to ``out`` if provided. Otherwise, i.e. for + reductions along all axes without ``out`` parameter, the result + is a scalar. + """ + a = _prepare_array_for_reduction(a, out) + if a.dtype == numpy.dtype('float16'): + # Gives wrong results currently, see + # https://github.com/Theano/libgpuarray/issues/316 + raise NotImplementedError('float16 currently broken') + + axes = axis if axis is not None else tuple(range(a.ndim)) + if out is not None: + out_arr = out.reshape(reduce_dims(a.shape, axes)) + else: + out_arr = out + + oper = '(a {} b) ? 
a : b'.format(cmp) + r = reduce1(a, op=None, neutral=neutral, out_type=a.dtype, axis=axis, + out=out_arr, oper=oper) + if keepdims: + axes = axis if axis is not None else tuple(range(a.ndim)) + newshape = restore_reduced_dims(r.shape, axes) + r = r.reshape(newshape) + if not r.shape and out is None: + return numpy.asarray(r).reshape([1])[0] + elif out is not None: + return out + else: + return r + + +def amin(a, axis=None, out=None, keepdims=False): + """Return the minimum of an array or minimum along an axis. + + See Also + -------- + numpy.amin + """ + a = _prepare_array_for_reduction(a, out) + if numpy.issubsctype(a.dtype, numpy.integer): + neutral = numpy.iinfo(a.dtype).max + elif numpy.issubsctype(a.dtype, numpy.floating): + neutral = numpy.finfo(a.dtype).max + elif numpy.issubsctype(a.dtype, numpy.complexfloating): + raise ValueError("array dtype '{}' not comparable" + "".format(a.dtype.name)) + else: + raise ValueError("array dtype '{}' not supported" + "".format(a.dtype.name)) + return reduce_with_cmp(a, '<', neutral, axis, out, keepdims) + + +def amax(a, axis=None, out=None, keepdims=False): + """Return the maximum of an array or minimum along an axis. + + See Also + -------- + numpy.amax + """ + a = _prepare_array_for_reduction(a, out) + if numpy.issubsctype(a.dtype, numpy.integer): + neutral = numpy.iinfo(a.dtype).min + elif numpy.issubsctype(a.dtype, numpy.floating): + neutral = numpy.finfo(a.dtype).min + elif numpy.issubsctype(a.dtype, numpy.complexfloating): + raise ValueError("array dtype '{}' not comparable" + "".format(a.dtype.name)) + else: + raise ValueError("array dtype '{}' not supported" + "".format(a.dtype.name)) + return reduce_with_cmp(a, '>', neutral, axis, out, keepdims) + + +# --- Elementwise ufuncs --- # + + +UNARY_C_FUNC_TO_UFUNC = { + 'abs': 'absolute', + 'acos': 'arccos', + 'acosh': 'arccosh', + 'asin': 'arcsin', + 'asinh': 'arcsinh', + 'atan': 'arctan', + 'atanh': 'arctanh', + 'cbrt': 'cbrt', + 'ceil': 'ceil', + 'cos': 'cos', + 'cosh': 'cosh', + 'exp': 'exp', + 'exp2': 'exp2', + 'expm1': 'expm1', + 'fabs': 'fabs', + 'floor': 'floor', + 'log': 'log', + 'log10': 'log10', + 'log1p': 'log1p', + 'log2': 'log2', + 'rint': 'rint', + 'sin': 'sin', + 'sinh': 'sinh', + 'sqrt': 'sqrt', + 'tan': 'tan', + 'tanh': 'tanh', + 'trunc': 'trunc', + } + +UNARY_UFUNC_TO_C_FUNC = {v: k for k, v in UNARY_C_FUNC_TO_UFUNC.items()} + +UNARY_UFUNC_TO_C_OP = { + 'bitwise_not': '~', + 'logical_not': '!', + 'negative': '-', + } + +UNARY_UFUNCS = (list(UNARY_UFUNC_TO_C_FUNC.keys()) + + list(UNARY_UFUNC_TO_C_OP.keys())) +UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit', + 'square']) + +BINARY_C_FUNC_TO_UFUNC = { + 'atan2': 'arctan2', + 'copysign': 'copysign', + 'hypot': 'hypot', + 'ldexp': 'ldexp', + 'pow': 'power', + 'nextafter': 'nextafter', + 'fmod': 'fmod', +} + +BINARY_UFUNC_TO_C_FUNC = {v: k for k, v in BINARY_C_FUNC_TO_UFUNC.items()} + +BINARY_UFUNC_TO_C_OP = { + 'add': '+', + 'bitwise_and': '&', + 'bitwise_or': '|', + 'bitwise_xor': '^', + 'equal': '==', + 'greater': '>', + 'greater_equal': '>=', + 'left_shift': '<<', + 'less': '<', + 'less_equal': '<=', + 'logical_and': '&&', + 'logical_or': '||', + 'multiply': '*', + 'not_equal': '!=', + 'right_shift': '>>', + 'subtract': '-', + } + +BINARY_UFUNCS = (list(BINARY_UFUNC_TO_C_FUNC.keys()) + + list(BINARY_UFUNC_TO_C_OP.keys())) +BINARY_UFUNCS.extend(['floor_divide', 'true_divide', 'logical_xor', + 'maximum', 'minimum', 'remainder']) + + +def ufunc_dtypes(ufunc_name, dtypes_in): + """Return result dtypes for 
a ufunc and input dtypes. + + Parameters + ---------- + ufunc_name : str + Name of the Numpy ufunc. + dtypes_in : sequence of `numpy.dtype` + Data types of the arrays. + + Returns + ------- + prom_in_dtypes : tuple of `numpy.dtype` + Promoted input dtypes, different from ``dtypes_in`` if type + promotion is necessary for the ufunc. + result_dtypes : tuple of `numpy.dtype` + Resulting data types of the ufunc. If a function is not suited + for integers, the dtype is promoted to the smallest possible + supported floating point data type. + + Examples + -------- + Real floating point types: + + >>> ufunc_dtypes('absolute', [numpy.dtype('float64')]) + ((dtype('float64'),), (dtype('float64'),)) + >>> ufunc_dtypes('absolute', [numpy.dtype('float32')]) + ((dtype('float32'),), (dtype('float32'),)) + >>> ufunc_dtypes('power', [numpy.dtype('float32'), numpy.dtype('float64')]) + ((dtype('float64'), dtype('float64')), (dtype('float64'),)) + >>> ufunc_dtypes('power', [numpy.dtype('float32'), numpy.dtype('float32')]) + ((dtype('float32'), dtype('float32')), (dtype('float32'),)) + + Integer types -- some functions produce integer results, others + need to convert to floating point: + + >>> ufunc_dtypes('absolute', [numpy.dtype('int8')]) + ((dtype('int8'),), (dtype('int8'),)) + >>> ufunc_dtypes('absolute', [numpy.dtype('int16')]) + ((dtype('int16'),), (dtype('int16'),)) + >>> ufunc_dtypes('exp', [numpy.dtype('int8')]) + ((dtype('float32'),), (dtype('float32'),)) + >>> ufunc_dtypes('exp', [numpy.dtype('int16')]) + ((dtype('float32'),), (dtype('float32'),)) + >>> ufunc_dtypes('exp', [numpy.dtype('int32')]) + ((dtype('float64'),), (dtype('float64'),)) + >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('int8')]) + ((dtype('int8'), dtype('int8')), (dtype('int8'),)) + >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('float32')]) + ((dtype('float32'), dtype('float32')), (dtype('float32'),)) + """ + npy_ufunc = getattr(numpy, ufunc_name) + + # TODO: get supported dtypes from some place within libgpuarray? 
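+    # Illustrative sketch of what the comprehension below yields on a
+    # typical 64-bit platform (the exact contents depend on
+    # ``numpy.sctypes``, so this is an assumption, hence the skipped
+    # doctest):
+    #
+    #   >>> sorted(dt.name for dt in dtypes)  # doctest: +SKIP
+    #   ['float32', 'float64', 'int16', 'int32', 'int64', 'int8',
+    #    'uint16', 'uint32', 'uint64', 'uint8']
+    #
+    # float16 is left out since it currently gives wrong results (see
+    # https://github.com/Theano/libgpuarray/issues/316), and float128 /
+    # complex256 have no corresponding GPU kernels.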
+ dtypes = [numpy.dtype(dt) + for dt in chain(numpy.sctypes['int'], numpy.sctypes['uint'], + numpy.sctypes['float']) + if numpy.dtype(dt) not in [numpy.dtype('float16'), + numpy.dtype('float128'), + numpy.dtype('complex256')]] + + # Filter for dtypes larger than our input types + def larger_eq_than_dtypes(sig): + from_part = sig.split('->')[0] + if len(from_part) != len(dtypes_in): + return False + else: + dts = tuple(numpy.dtype(c) for c in from_part) + # Currently unsupported, filtering out + if any(dt not in dtypes for dt in dts): + return False + else: + return all(dt >= dt_in for dt, dt_in in zip(dts, dtypes_in)) + + larger_sig_list = list(filter(larger_eq_than_dtypes, npy_ufunc.types)) + if not larger_sig_list: + # TODO: Numpy raises TypeError for bad data types, which is wrong, + # but we mirror that behavior + raise TypeError('data types {} not supported for ufunc {}' + ''.format(tuple(dt.name for dt in dtypes_in), + ufunc_name)) + + # Key function for signature comparison + def from_part_key(sig): + from_part = sig.split('->')[0] + return tuple(numpy.dtype(c) for c in from_part) + + # Get the smallest signature larger than our input dtypes + smallest_sig = min(larger_sig_list, key=from_part_key) + smallest_str_in, smallest_str_out = smallest_sig.split('->') + prom_dtypes = tuple(numpy.dtype(c) for c in smallest_str_in) + result_dtypes = tuple(numpy.dtype(c) for c in smallest_str_out) + + # Quad precision unsupported also on output side + if any(dt in result_dtypes for dt in (numpy.dtype('float16'), + numpy.dtype('float128'))): + # TODO: Numpy raises TypeError for bad data types, which is wrong, + # but we mirror that behavior + raise TypeError('data types {} not supported for ufunc {}' + ''.format(tuple(dt.name for dt in dtypes_in), + ufunc_name)) + + return prom_dtypes, result_dtypes + + +def ufunc_c_fname(ufunc_name, dtypes_in): + """Return C function name for a ufunc and input dtypes. + + The names are only specialized for complex math and to distinguish + between functions for absolute value. There is no name-mangling for + specific precisions. + + Parameters + ---------- + ufunc_name : str + Name of the Numpy ufunc. + dtype_in : sequence of `numpy.dtype` + Data types of input arrays. + + Returns + ------- + cname : str + C name of the ufunc. + + Examples + -------- + >>> ufunc_c_fname('exp', [numpy.dtype('float32')]) + 'exp' + >>> ufunc_c_fname('exp', [numpy.dtype('int16')]) + 'exp' + >>> ufunc_c_fname('absolute', [numpy.dtype('float64')]) + 'fabs' + >>> ufunc_c_fname('absolute', [numpy.dtype('float32')]) + 'fabs' + >>> ufunc_c_fname('absolute', [numpy.dtype('int8')]) + 'abs' + >>> ufunc_c_fname('power', + ... [numpy.dtype('float32'), numpy.dtype('float64')]) + 'pow' + + See Also + -------- + ufunc_result_dtype : Get dtype of a ufunc result. + """ + _, result_dtypes = ufunc_dtypes(ufunc_name, dtypes_in) + result_dtype = result_dtypes[0] + c_base_name = UNARY_UFUNC_TO_C_FUNC.get(ufunc_name, None) + if c_base_name is None: + c_base_name = BINARY_UFUNC_TO_C_FUNC.get(ufunc_name) + + if c_base_name == 'abs': + if numpy.issubsctype(dtypes_in[0], numpy.floating): + c_base_name = 'fabs' + + if numpy.issubsctype(result_dtype, numpy.complexfloating): + prefix = 'c' + # Currently broken + raise NotImplementedError('complex math kernels currently not ' + 'available') + else: + prefix = '' + + return prefix + c_base_name + + +def unary_ufunc(a, ufunc_name, out=None): + """Call a unary ufunc on an array ``a``. + + Parameters + ---------- + a : `array-like` + Input array. 
+ ufunc_name : str + Name of the NumPy ufunc to be called on ``a``. + out : `pygpu.gpuarray.GpuArray`, optional + Array in which to store the result. Its shape must be equal to + ``a.shape`` and its dtype must be the result dtype of the called + function. + If ``out=None`` and ``a`` is not a GPU array, a default + GPU context must have been set. + + Returns + ------- + out : `pygpu.gpuarray.GpuArray` + Result of the computation. If ``out`` was given, the returned + object is a reference to it. If ``a`` was not a GPU arrays, the + type of ``out`` is `pygpu._array.ndgpuarray`. + + See Also + -------- + ufunc_result_dtype : Get dtype of a ufunc result. + ufunc_c_funcname : Get C name for math ufuncs. + """ + # Get a context and an array class to work with + need_context = True + for ary in (a, out): + if isinstance(ary, GpuArray): + ctx = ary.context + cls = ary.__class__ + need_context = False + break + if need_context: + ctx = get_default_context() + # TODO: sensible choice? Makes sense to choose the more "feature-rich" + # variant here perhaps. + cls = ndgpuarray + + # TODO: can CPU memory handed directly to kernels? + if not isinstance(a, GpuArray): + a = numpy.asarray(a) + if a.flags.f_contiguous and not a.flags.c_contiguous: + order = 'F' + else: + order = 'C' + a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + cls=cls) + + if a.dtype == numpy.dtype('float16'): + # Gives wrong results currently, see + # https://github.com/Theano/libgpuarray/issues/316 + raise NotImplementedError('float16 currently broken') + + prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name, [a.dtype]) + prom_dtype_in = prom_dtypes_in[0] + result_dtype = result_dtypes[0] + + # This is the "fallback signature" case, for us it signals failure. + # TypeError is what Numpy raises, too, which is kind of wrong + if prom_dtype_in == numpy.dtype(object): + raise TypeError("input dtype '{}' invalid for ufunc '{}'" + "".format(a.dtype.name, ufunc_name)) + + # Convert input such that the kernel runs + # TODO: can this be avoided? + a = a.astype(prom_dtype_in, copy=False) + + if out is None: + out = empty(a.shape, dtype=result_dtype, context=ctx, cls=cls) + else: + # TODO: allow larger dtype + if out.dtype != result_dtype: + raise ValueError("`out.dtype` != result dtype ('{}' != '{}')" + "".format(out.dtype.name, result_dtype.name)) + + # C result dtype for casting + # TODO: necessary? overly cautious? + c_res_dtype = dtype_to_ctype(result_dtype) + + oper = '' + + # Case 1: math function + if ufunc_name in UNARY_UFUNC_TO_C_FUNC: + c_func = ufunc_c_fname(ufunc_name, (a.dtype,)) + oper = 'res = ({}) {}(a)'.format(c_res_dtype, c_func) + + # Case 2: unary operator + unop = UNARY_UFUNC_TO_C_OP.get(ufunc_name, None) + if unop is not None: + oper = 'res = ({}) {}a'.format(c_res_dtype, unop) + + # Other cases: specific functions + if ufunc_name == 'deg2rad': + oper = 'res = ({rdt})({}) * ({rdt}) a'.format(numpy.deg2rad(1), + rdt=c_res_dtype) + + if ufunc_name == 'rad2deg': + oper = 'res = ({rdt})({}) * ({rdt}) a'.format(numpy.rad2deg(1), + rdt=c_res_dtype) + + if ufunc_name == 'reciprocal': + oper = 'res = ({}) (1.0) / a'.format(c_res_dtype) + + if ufunc_name == 'sign': + oper = 'res = ({}) ((a > 0) ? 1 : (a < 0) ? 
-1 : 0)'.format( + c_res_dtype) + + if ufunc_name == 'signbit': + oper = 'res = ({}) (a < 0)'.format(c_res_dtype) + + if ufunc_name == 'square': + oper = 'res = ({}) (a * a)'.format(c_res_dtype) + + if not oper: + raise ValueError("`ufunc_name` '{}' does not represent a unary ufunc" + "".format(ufunc_name)) + + a_arg = as_argument(a, 'a', read=True) + args = [arg('res', out.dtype, write=True), a_arg] + + ker = GpuElemwise(ctx, oper, args, convert_f16=True) + ker(out, a) + return out + + +def binary_ufunc(a, b, ufunc_name, out=None): + """Call binary ufunc on ``a`` and ``b``. + + Parameters + ---------- + a, b : `array-like` + Input arrays. + ufunc_name : str + Name of the NumPy ufunc to be called on ``a`` and ``b``. + out : `pygpu.gpuarray.GpuArray`, optional + Array in which to store the result. Its shape must be equal to + ``a.shape`` and its dtype must be the result dtype of the called + function. + If ``out=None`` and ``a, b`` are both not GPU arrays, a default + GPU context must have been set. + + Returns + ------- + out : `pygpu.gpuarray.GpuArray` + Result of the computation. If ``out`` was given, the returned + object is a reference to it. If both input arrays were not + GPU arrays, the type of ``out`` is `pygpu._array.ndgpuarray`. + + See Also + -------- + pygpu.gpuarray.set_default_context + """ + # Get a context and an array class to work with + need_context = True + for ary in (a, b, out): + if isinstance(a, GpuArray): + ctx = ary.context + cls = ary.__class__ + need_context = False + break + if need_context: + ctx = get_default_context() + # TODO: sensible choice? Makes sense to choose the more "feature-rich" + # variant here perhaps. + cls = ndgpuarray + + # TODO: can CPU memory handed directly to kernels? + if not isinstance(a, GpuArray): + a = numpy.asarray(a) + if a.flags.f_contiguous and not a.flags.c_contiguous: + order = 'F' + else: + order = 'C' + a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + cls=cls) + if not isinstance(b, GpuArray): + b = numpy.asarray(b) + if b.flags.f_contiguous and not b.flags.c_contiguous: + order = 'F' + else: + order = 'C' + b = array(b, dtype=b.dtype, copy=False, order=order, context=ctx, + cls=cls) + + if any(ary.dtype == numpy.dtype('float16') for ary in (a, b)): + # Gives wrong results currently, see + # https://github.com/Theano/libgpuarray/issues/316 + raise NotImplementedError('float16 currently broken') + + prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name, + [a.dtype, b.dtype]) + prom_dtype_a, prom_dtype_b = prom_dtypes_in + result_dtype = result_dtypes[0] + + # This is the "fallback signature" case, for us it signals failure + if any(dt == numpy.dtype(object) for dt in prom_dtypes_in): + raise TypeError("input dtypes {} invalid for ufunc '{}'" + "".format((a.dtype.name, b.dtype.name), ufunc_name)) + + # Convert input such that the kernel runs + # TODO: can this be avoided? 
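+    # Rough example of the casting and shape handling below (illustrative
+    # only, the shapes are hypothetical): for ``power`` with a.dtype int8
+    # and b.dtype float32, both inputs are cast to float32 first; for
+    # a.shape == (2, 3) and b.shape == (3,), b is viewed as (1, 3) and the
+    # result shape becomes (2, 3). The per-element broadcasting itself is
+    # handled by the GpuElemwise kernel, which is called with
+    # ``broadcast=True``.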
+ a = a.astype(prom_dtype_a, copy=False) + b = b.astype(prom_dtype_b, copy=False) + + if a.ndim != b.ndim: + nd = max(a.ndim, b.ndim) + if a.ndim < nd: + a = a.reshape(((1,) * (nd - a.ndim)) + a.shape) + if b.ndim < nd: + b = b.reshape(((1,) * (nd - b.ndim)) + b.shape) + result_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape)) + + if out is None: + out = empty(result_shape, dtype=result_dtype, context=ctx, cls=cls) + else: + if out.shape != result_shape: + raise ValueError('`out.shape` != result shape ({} != {})' + ''.format(out.shape, result_shape)) + if out.dtype != result_dtype: + raise ValueError("`out.dtype` != result dtype ('{}' != '{}')" + ''.format(out.dtype.name, result_dtype.name)) + + a_arg = as_argument(a, 'a', read=True) + b_arg = as_argument(b, 'b', read=True) + args = [arg('res', result_dtype, write=True), a_arg, b_arg] + + # C result dtype for casting + c_res_dtype = dtype_to_ctype(result_dtype) + + # Set string for mapping operation + oper = '' + + # Case 1: math function + if ufunc_name in BINARY_UFUNC_TO_C_FUNC: + if ufunc_name == 'power': + # Arguments to `pow` cannot be integer, need to cast + if numpy.issubsctype(result_dtype, numpy.integer): + tpl = 'res = ({rt}) (long) (pow((double) a, (double) b) + 0.5)' + oper = tpl.format(rt=c_res_dtype) + else: + oper = 'res = ({rt}) pow(({rt}) a, ({rt}) b)'.format( + rt=c_res_dtype) + + elif ufunc_name == 'fmod': + # Arguments to `fmod` cannot be integer, need to cast + if numpy.issubsctype(result_dtype, numpy.integer): + oper = 'res = ({rt}) fmod((double) a, (double) b)'.format( + rt=c_res_dtype) + else: + oper = 'res = ({rt}) fmod(({rt}) a, ({rt}) b)'.format( + rt=c_res_dtype) + + else: + c_fname = ufunc_c_fname(ufunc_name, [a.dtype, b.dtype]) + oper = 'res = ({}) {}(a, b)'.format(c_res_dtype, c_fname) + + # Case 2: binary operator + binop = BINARY_UFUNC_TO_C_OP.get(ufunc_name, None) + if binop is not None: + oper = 'res = ({}) (a {} b)'.format(c_res_dtype, binop) + else: + # Other cases: specific functions + if ufunc_name == 'floor_divide': + oper = 'res = ({}) (long)(a / b)'.format(c_res_dtype) + + if ufunc_name == 'true_divide': + if result_dtype == numpy.dtype('float64'): + flt = 'double' + else: + flt = 'float' + oper = 'res = ({}) (({flt}) a / ({flt}) b)'.format(c_res_dtype, + flt=flt) + + elif ufunc_name == 'logical_xor': + oper = 'res = ({}) (a ? !b : b)'.format(c_res_dtype) + + elif ufunc_name == 'maximum': + oper = 'res = ({}) ((a > b) ? a : b)'.format(c_res_dtype) + + elif ufunc_name == 'minimum': + oper = 'res = ({}) ((a < b) ? a : b)'.format(c_res_dtype) + + elif ufunc_name == 'remainder': + # Cannot use `out` in the first call since the dtype may differ + # from `result_dtype`, and `copysign` has no integer signature. + # TODO: doable in one call? + res = binary_ufunc(a, b, 'fmod') + res = binary_ufunc(res, b, 'copysign') + out[:] = res.astype(result_dtype) + return out + + if not oper: + raise ValueError("`ufunc_name` '{}' does not represent a binary " + "ufunc".format(ufunc_name)) + + kernel = GpuElemwise(a.context, oper, args, convert_f16=True) + kernel(out, a, b, broadcast=True) + return out + + +MISSING_UFUNCS = [ + 'conjugate', # no complex dtype yet + 'frexp', # multiple output values, how to handle that? + 'isfinite', # how to test in C? + 'isinf', # how to test in C? + 'isnan', # how to test in C? + 'logaddexp', # not a one-liner (at least not in numpy) + 'logaddexp2', # not a one-liner (at least not in numpy) + 'modf', # multiple output values, how to handle that? 
+ 'spacing', # implementation? + ] + + +UFUNC_SYNONYMS = [ + ('absolute', 'abs'), + # ('conjugate', 'conj'), + ('deg2rad', 'degrees'), + ('rad2deg', 'radians'), + ('true_divide', 'divide'), + ('maximum', 'fmax'), # differ in NaN propagation in numpy, doable? + ('minimum', 'fmin'), # differ in NaN propagation in numpy, doable? + ('bitwise_not', 'invert'), + ('remainder', 'mod') + ] + + +def make_unary_ufunc(name, doc): + def wrapper(a, out=None): + return unary_ufunc(a, name, out) + wrapper.__qualname__ = wrapper.__name__ = name + wrapper.__doc__ = doc + return wrapper + + +# Add the ufuncs to the module dictionary +for ufunc_name in UNARY_UFUNCS: + npy_ufunc = getattr(numpy, ufunc_name) + descr = npy_ufunc.__doc__.splitlines()[2] + # Numpy occasionally uses single ticks for doc, we only use them for links + descr = re.sub('`+', '``', descr) + doc = descr + """ + +See Also +-------- +numpy.{} +""".format(ufunc_name) + + globals()[ufunc_name] = make_unary_ufunc(ufunc_name, doc) + + +def make_binary_ufunc(name, doc): + def wrapper(a, b, out=None): + return binary_ufunc(a, b, name, out) + + wrapper.__qualname__ = wrapper.__name__ = name + wrapper.__doc__ = doc + return wrapper + + +for ufunc_name in BINARY_UFUNCS: + npy_ufunc = getattr(numpy, ufunc_name) + descr = npy_ufunc.__doc__.splitlines()[2] + # Numpy occasionally uses single ticks for doc, we only use them for links + descr = re.sub('`+', '``', descr) + doc = descr + """ + +See Also +-------- +numpy.{} +""".format(ufunc_name) + + globals()[ufunc_name] = make_binary_ufunc(ufunc_name, doc) + + +for name, alt_name in UFUNC_SYNONYMS: + globals()[alt_name] = globals()[name] + + +if __name__ == '__main__': + # pylint: disable=wrong-import-position + from doctest import testmod, NORMALIZE_WHITESPACE + import numpy + optionflags = NORMALIZE_WHITESPACE + testmod(optionflags=NORMALIZE_WHITESPACE, extraglobs={'np': numpy}) From baf0d9bd909aa5535681986dffb90896b4641995 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 26 Dec 2016 16:40:38 +0100 Subject: [PATCH 02/29] TST: add tests for ufuncs and reductions --- pygpu/tests/test_ufuncs.py | 218 +++++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 pygpu/tests/test_ufuncs.py diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py new file mode 100644 index 0000000000..61e06b2d59 --- /dev/null +++ b/pygpu/tests/test_ufuncs.py @@ -0,0 +1,218 @@ +from itertools import chain +import numpy +from unittest import TestCase + +import pygpu +from pygpu import ufuncs +from pygpu.ufuncs import UNARY_UFUNCS, BINARY_UFUNCS +from .support import context + + +pygpu.set_default_context(context) +numpy.seterr(invalid='ignore', divide='ignore') # avoid stdout warnings + + +# --- Helper functions and global definitions --- # + +def npy_and_gpuary_arrays(shape, dtype, positive=False): + """Return Numpy and GPU arrays of given shape and dtype.""" + if numpy.issubsctype(dtype, numpy.unsignedinteger): + npy_arr = numpy.random.randint(1, 10, size=shape, dtype=dtype) + elif numpy.issubsctype(dtype, numpy.integer): + low = 1 if positive else -10 + npy_arr = numpy.random.randint(low, 10, size=shape, dtype=dtype) + elif numpy.issubsctype(dtype, numpy.floating): + npy_arr = numpy.random.normal(size=shape).astype(dtype) + if positive: + npy_arr = numpy.abs(npy_arr) + 0.1 + elif numpy.issubsctype(dtype, numpy.complexfloating): + npy_arr_re = numpy.random.normal(size=shape).astype(dtype) + npy_arr_im = numpy.random.normal(size=shape).astype(dtype) + npy_arr = npy_arr_re + 1j * 
npy_arr_im + if positive: + npy_arr = (numpy.abs(npy_arr) + 0.1).astype(dtype) + else: + assert False + + gpuary_arr = pygpu.array(npy_arr, dtype=dtype) + return npy_arr, gpuary_arr + + +reductions = ['sum', 'prod', 'amin', 'amax'] +axis_params = [None, 0, 2, (1, 2), (0,), (0, 1), (0, 1, 2)] +dtypes = [numpy.dtype(dt) + for dt in chain(numpy.sctypes['int'], numpy.sctypes['uint'], + numpy.sctypes['float']) + if numpy.dtype(dt) not in [numpy.dtype('float16'), + numpy.dtype('float128'), + numpy.dtype('complex256')]] + +# --- Ufuncs & Reductions --- # + + +class test_reduction(TestCase): + + def test_all(self): + for reduction in reductions: + for axis in axis_params: + for keepdims in [True, False]: + for dtype in dtypes: + self.check_reduction(reduction, dtype, axis, keepdims) + + def check_reduction(self, reduction, dtype, axis, keepdims): + """Test GpuArray reduction against equivalent Numpy result.""" + gpuary_reduction = getattr(ufuncs, reduction) + npy_reduction = getattr(numpy, reduction) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4), + dtype=dtype) + # Determine relative tolerance from dtype + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = 0 + rtol = 2 * npy_arr.size * res + + if (numpy.issubsctype(dtype, numpy.complexfloating) and + reduction in ('amin', 'amax')): + with self.assertRaises(ValueError): + gpuary_reduction(gpuary_arr, axis=axis, keepdims=keepdims) + return + + # No explicit out dtype + npy_result = npy_reduction(npy_arr, axis=axis, keepdims=keepdims) + gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, + keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol) + + # With out array + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, out=out, + keepdims=keepdims) + assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) + assert out is gpuary_result + + # Explicit out dtype, supported by some reductions only + if reduction in ('sum', 'prod'): + out_dtype = numpy.promote_types(dtype, numpy.float32) + gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, + dtype=out_dtype, + keepdims=keepdims) + assert gpuary_result.dtype == out_dtype + assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) + + # Using numpy arrays as input + gpuary_result = gpuary_reduction(npy_arr, axis=axis, keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol) + + +class test_unary_ufuncs(TestCase): + + def test_all(self): + for ufunc in UNARY_UFUNCS: + for dtype in dtypes: + self.check_unary_ufunc(ufunc, dtype) + + def check_unary_ufunc(self, ufunc, dtype): + """Test GpuArray unary ufuncs against equivalent Numpy results.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype, + positive=True) + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + 
numpy.float16)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr) + except TypeError: + # Make sure we raise the same error as Numpy + with self.assertRaises(TypeError): + gpuary_ufunc(gpuary_arr) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) + + # In-place + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_ufunc(gpuary_arr, out) + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + equal_nan=True) + + +class test_binary_ufuncs(TestCase): + + def test_all(self): + for ufunc in BINARY_UFUNCS: + for dtype1 in dtypes: + for dtype2 in dtypes: + self.check_binary_ufunc(ufunc, dtype1, dtype2) + + def check_binary_ufunc(self, ufunc, dtype1, dtype2): + """Test GpuArray binary ufunc against equivalent Numpy result.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype1, + positive=True) + npy_arr2, gpuary_arr2 = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype2, + positive=True) + + try: + res = numpy.finfo(numpy.result_type(dtype1, dtype2)).resolution + except ValueError: + res = numpy.finfo(numpy.result_type(numpy.float16, + dtype1, dtype2)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr, npy_arr2) + except TypeError: + # Make sure we raise the same error as Numpy + with self.assertRaises(TypeError): + gpuary_ufunc(gpuary_arr, gpuary_arr2) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr, gpuary_arr2) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) + + # In-place + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_ufunc(gpuary_arr, gpuary_arr2, out) + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + equal_nan=True) From 159133fc407588846a96e6722138c9364e7eda10 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Wed, 28 Dec 2016 19:38:46 +0100 Subject: [PATCH 03/29] TST: add test and fixes for binary ufuncs with scalars --- pygpu/tests/test_ufuncs.py | 42 +++++++++++++++++++++++++++ pygpu/ufuncs.py | 58 ++++++++++++++++++++++++++++++-------- 2 files changed, 88 insertions(+), 12 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index 61e06b2d59..fa0a25bec6 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -175,6 +175,15 @@ def test_all(self): for dtype2 in dtypes: self.check_binary_ufunc(ufunc, dtype1, dtype2) + scalars = [1, 5] + if ufunc not in ('fmod', 'remainder', 'floor_divide'): + scalars.append(0) + if not numpy.issubsctype(dtype1, numpy.unsignedinteger): + scalars += [-1, -2] + + for scalar in scalars: + self.check_binary_ufunc_scalar(ufunc, scalar, dtype1) + def check_binary_ufunc(self, ufunc, dtype1, dtype2): """Test GpuArray binary ufunc against equivalent Numpy result.""" gpuary_ufunc = getattr(ufuncs, ufunc) @@ -216,3 +225,36 @@ def check_binary_ufunc(self, ufunc, dtype1, dtype2): 
gpuary_ufunc(gpuary_arr, gpuary_arr2, out) assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, equal_nan=True) + + def check_binary_ufunc_scalar(self, ufunc, scalar, dtype): + """Test GpuArray binary ufunc with scalar second operand.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype, + positive=True) + + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + numpy.float16)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr, scalar) + except TypeError: + # Make sure we raise the same error as Numpy + with self.assertRaises(TypeError): + gpuary_ufunc(gpuary_arr, scalar) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr, scalar) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 492d18e3b5..01ad9fdf04 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -677,7 +677,15 @@ def binary_ufunc(a, b, ufunc_name, out=None): # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): - a = numpy.asarray(a) + if numpy.isscalar(a): + # TODO: this is quite hacky, perhaps mixed input signatures + # should be handled in ufunc_dtypes? + if ufunc_name == 'ldexp': + # Want signed type + a = numpy.asarray(a, dtype=numpy.min_scalar_type(-abs(a))) + else: + a = numpy.asarray(a, dtype=numpy.result_type(a, b)) + if a.flags.f_contiguous and not a.flags.c_contiguous: order = 'F' else: @@ -685,7 +693,15 @@ def binary_ufunc(a, b, ufunc_name, out=None): a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, cls=cls) if not isinstance(b, GpuArray): - b = numpy.asarray(b) + if numpy.isscalar(b): + # TODO: this is quite hacky, perhaps mixed input signatures + # should be handled in ufunc_dtypes? + if ufunc_name == 'ldexp': + # Want signed type + b = numpy.asarray(b, dtype=numpy.min_scalar_type(-abs(b))) + else: + b = numpy.asarray(b, dtype=numpy.result_type(a, b)) + if b.flags.f_contiguous and not b.flags.c_contiguous: order = 'F' else: @@ -746,8 +762,9 @@ def binary_ufunc(a, b, ufunc_name, out=None): if ufunc_name == 'power': # Arguments to `pow` cannot be integer, need to cast if numpy.issubsctype(result_dtype, numpy.integer): - tpl = 'res = ({rt}) (long) (pow((double) a, (double) b) + 0.5)' - oper = tpl.format(rt=c_res_dtype) + eps = 0.001 + tpl = 'res = ({rt}) (long) (pow((double) a, (double) b) + {})' + oper = tpl.format(eps, rt=c_res_dtype) else: oper = 'res = ({rt}) pow(({rt}) a, ({rt}) b)'.format( rt=c_res_dtype) @@ -772,7 +789,18 @@ def binary_ufunc(a, b, ufunc_name, out=None): else: # Other cases: specific functions if ufunc_name == 'floor_divide': - oper = 'res = ({}) (long)(a / b)'.format(c_res_dtype) + # implement as sign(a/b) * int(abs(a/b) + shift(a,b)) + # where shift(a,b) = 0 if sign(a) == sign(b) else 1 - epsilon + sign_part = '(((a < 0) != (b < 0)) ? -1 : 1)' + # TODO: this computes twice - better solution? + abs_div_part = ('(((double)a / b < 0) ? ' + + '-(double)a / b : ' + + '(double)a / b)') + shift_part = '(((a < 0) != (b < 0)) ? 
0.999 : 0)' + oper = 'res = ({}) ({} * (long)({} + {}))'.format(c_res_dtype, + sign_part, + abs_div_part, + shift_part) if ufunc_name == 'true_divide': if result_dtype == numpy.dtype('float64'): @@ -792,13 +820,19 @@ def binary_ufunc(a, b, ufunc_name, out=None): oper = 'res = ({}) ((a < b) ? a : b)'.format(c_res_dtype) elif ufunc_name == 'remainder': - # Cannot use `out` in the first call since the dtype may differ - # from `result_dtype`, and `copysign` has no integer signature. - # TODO: doable in one call? - res = binary_ufunc(a, b, 'fmod') - res = binary_ufunc(res, b, 'copysign') - out[:] = res.astype(result_dtype) - return out + # The same as `fmod` except for b < 0, where we have + # remainder(a,b) = fmod(a, b) + if numpy.issubsctype(result_dtype, numpy.integer): + cast_type = 'double' + else: + cast_type = c_res_dtype + + # TODO: this computes fmod(a,b) twice for b < 0, better solution? + mod_expr = 'fmod(({ct}) a, ({ct}) b)'.format(ct=cast_type) + comp_part = ('(((b < 0) && ({expr} != 0)) ? ' + + '({ct})b + {expr} : ' + + '{expr})').format(ct=cast_type, expr=mod_expr) + oper = 'res = ({rt}) {}'.format(comp_part, rt=c_res_dtype) if not oper: raise ValueError("`ufunc_name` '{}' does not represent a binary " From 66d69b92fe15191a603f31bf3092699215113ea2 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 3 Mar 2017 18:44:45 +0100 Subject: [PATCH 04/29] TEMP: insert debug print into call_compiler to see kernel source --- src/gpuarray_buffer_cuda.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 8f6e2cdb6f..4bfb128f36 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1076,6 +1076,8 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { , "-G", "-lineinfo" }; nvrtcResult err; + // DEBUG: print the kernel source + // printf("%s\n", src); opts[1] = ctx->bin_id; From f03bf65d0fd12d147cfc6c4dfdc605510aa0868c Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 3 Mar 2017 18:47:18 +0100 Subject: [PATCH 05/29] TST: rewrite tests in yield style --- pygpu/tests/test_ufuncs.py | 425 ++++++++++++++++++------------------- 1 file changed, 210 insertions(+), 215 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index fa0a25bec6..69937b5bd5 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -1,11 +1,11 @@ -from itertools import chain +from nose.tools import assert_raises import numpy -from unittest import TestCase import pygpu +from pygpu.dtypes import NAME_TO_DTYPE from pygpu import ufuncs from pygpu.ufuncs import UNARY_UFUNCS, BINARY_UFUNCS -from .support import context +from pygpu.tests.support import context pygpu.set_default_context(context) @@ -16,7 +16,9 @@ def npy_and_gpuary_arrays(shape, dtype, positive=False): """Return Numpy and GPU arrays of given shape and dtype.""" - if numpy.issubsctype(dtype, numpy.unsignedinteger): + if dtype == numpy.bool: + npy_arr = numpy.random.randint(0, 1, size=shape, dtype=dtype) + elif numpy.issubsctype(dtype, numpy.unsignedinteger): npy_arr = numpy.random.randint(1, 10, size=shape, dtype=dtype) elif numpy.issubsctype(dtype, numpy.integer): low = 1 if positive else -10 @@ -40,221 +42,214 @@ def npy_and_gpuary_arrays(shape, dtype, positive=False): reductions = ['sum', 'prod', 'amin', 'amax'] axis_params = [None, 0, 2, (1, 2), (0,), (0, 1), (0, 1, 2)] -dtypes = [numpy.dtype(dt) - for dt in chain(numpy.sctypes['int'], numpy.sctypes['uint'], - numpy.sctypes['float']) - if numpy.dtype(dt) 
not in [numpy.dtype('float16'), - numpy.dtype('float128'), - numpy.dtype('complex256')]] +dtypes = set(NAME_TO_DTYPE.values()) # --- Ufuncs & Reductions --- # -class test_reduction(TestCase): - - def test_all(self): - for reduction in reductions: - for axis in axis_params: - for keepdims in [True, False]: - for dtype in dtypes: - self.check_reduction(reduction, dtype, axis, keepdims) - - def check_reduction(self, reduction, dtype, axis, keepdims): - """Test GpuArray reduction against equivalent Numpy result.""" - gpuary_reduction = getattr(ufuncs, reduction) - npy_reduction = getattr(numpy, reduction) - - npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4), - dtype=dtype) - # Determine relative tolerance from dtype - try: - res = numpy.finfo(dtype).resolution - except ValueError: - res = 0 - rtol = 2 * npy_arr.size * res - - if (numpy.issubsctype(dtype, numpy.complexfloating) and - reduction in ('amin', 'amax')): - with self.assertRaises(ValueError): - gpuary_reduction(gpuary_arr, axis=axis, keepdims=keepdims) - return - - # No explicit out dtype - npy_result = npy_reduction(npy_arr, axis=axis, keepdims=keepdims) +def test_reduction(): + for reduction in reductions: + for axis in axis_params: + for keepdims in [True, False]: + for dtype in dtypes: + yield check_reduction, reduction, dtype, axis, keepdims + + +def check_reduction(reduction, dtype, axis, keepdims): + """Test GpuArray reduction against equivalent Numpy result.""" + gpuary_reduction = getattr(ufuncs, reduction) + npy_reduction = getattr(numpy, reduction) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4), + dtype=dtype) + # Determine relative tolerance from dtype + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = 0 + rtol = 2 * npy_arr.size * res + + if (numpy.issubsctype(dtype, numpy.complexfloating) and + reduction in ('amin', 'amax')): + with assert_raises(ValueError): + gpuary_reduction(gpuary_arr, axis=axis, keepdims=keepdims) + return + + # No explicit out dtype + npy_result = npy_reduction(npy_arr, axis=axis, keepdims=keepdims) + gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, + keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol) + + # With out array + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, out=out, + keepdims=keepdims) + assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) + assert out is gpuary_result + + # Explicit out dtype, supported by some reductions only + if reduction in ('sum', 'prod'): + out_dtype = numpy.promote_types(dtype, numpy.float32) gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, + dtype=out_dtype, keepdims=keepdims) - if numpy.isscalar(npy_result): - assert numpy.isscalar(gpuary_result) - assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, - atol=rtol) - else: - assert npy_result.shape == gpuary_result.shape - assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - atol=rtol) - - # With out array - out = pygpu.empty(npy_result.shape, npy_result.dtype) - gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, out=out, - keepdims=keepdims) + assert gpuary_result.dtype == out_dtype assert numpy.allclose(npy_result, 
out, rtol=rtol, atol=rtol) - assert out is gpuary_result - - # Explicit out dtype, supported by some reductions only - if reduction in ('sum', 'prod'): - out_dtype = numpy.promote_types(dtype, numpy.float32) - gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, - dtype=out_dtype, - keepdims=keepdims) - assert gpuary_result.dtype == out_dtype - assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) - - # Using numpy arrays as input - gpuary_result = gpuary_reduction(npy_arr, axis=axis, keepdims=keepdims) - if numpy.isscalar(npy_result): - assert numpy.isscalar(gpuary_result) - assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, - atol=rtol) - else: - assert npy_result.shape == gpuary_result.shape - assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - atol=rtol) - - -class test_unary_ufuncs(TestCase): - - def test_all(self): - for ufunc in UNARY_UFUNCS: - for dtype in dtypes: - self.check_unary_ufunc(ufunc, dtype) - - def check_unary_ufunc(self, ufunc, dtype): - """Test GpuArray unary ufuncs against equivalent Numpy results.""" - gpuary_ufunc = getattr(ufuncs, ufunc) - npy_ufunc = getattr(numpy, ufunc) - - npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype, - positive=True) - try: - res = numpy.finfo(dtype).resolution - except ValueError: - res = numpy.finfo(numpy.promote_types(dtype, - numpy.float16)).resolution - rtol = 10 * res - - try: - npy_result = npy_ufunc(npy_arr) - except TypeError: - # Make sure we raise the same error as Numpy - with self.assertRaises(TypeError): - gpuary_ufunc(gpuary_arr) - else: - if npy_result.dtype == numpy.dtype('float16'): - # We use float32 as minimum for GPU arrays, do the same here - npy_result = npy_result.astype('float32') - - gpuary_result = gpuary_ufunc(gpuary_arr) - assert npy_result.shape == gpuary_result.shape - assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - atol=rtol, equal_nan=True) - - # In-place - out = pygpu.empty(npy_result.shape, npy_result.dtype) - gpuary_ufunc(gpuary_arr, out) - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - equal_nan=True) - - -class test_binary_ufuncs(TestCase): - - def test_all(self): - for ufunc in BINARY_UFUNCS: - for dtype1 in dtypes: - for dtype2 in dtypes: - self.check_binary_ufunc(ufunc, dtype1, dtype2) - - scalars = [1, 5] - if ufunc not in ('fmod', 'remainder', 'floor_divide'): - scalars.append(0) - if not numpy.issubsctype(dtype1, numpy.unsignedinteger): - scalars += [-1, -2] - - for scalar in scalars: - self.check_binary_ufunc_scalar(ufunc, scalar, dtype1) - - def check_binary_ufunc(self, ufunc, dtype1, dtype2): - """Test GpuArray binary ufunc against equivalent Numpy result.""" - gpuary_ufunc = getattr(ufuncs, ufunc) - npy_ufunc = getattr(numpy, ufunc) - - npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), - dtype=dtype1, - positive=True) - npy_arr2, gpuary_arr2 = npy_and_gpuary_arrays(shape=(2, 3), - dtype=dtype2, - positive=True) - - try: - res = numpy.finfo(numpy.result_type(dtype1, dtype2)).resolution - except ValueError: - res = numpy.finfo(numpy.result_type(numpy.float16, - dtype1, dtype2)).resolution - rtol = 10 * res - - try: - npy_result = npy_ufunc(npy_arr, npy_arr2) - except TypeError: - # Make sure we raise the same error as Numpy - with self.assertRaises(TypeError): - gpuary_ufunc(gpuary_arr, gpuary_arr2) - else: - if npy_result.dtype == numpy.dtype('float16'): - # We use float32 as minimum for GPU arrays, do the same 
here - npy_result = npy_result.astype('float32') - - gpuary_result = gpuary_ufunc(gpuary_arr, gpuary_arr2) - assert npy_result.shape == gpuary_result.shape - assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - atol=rtol, equal_nan=True) - - # In-place - out = pygpu.empty(npy_result.shape, npy_result.dtype) - gpuary_ufunc(gpuary_arr, gpuary_arr2, out) - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - equal_nan=True) - - def check_binary_ufunc_scalar(self, ufunc, scalar, dtype): - """Test GpuArray binary ufunc with scalar second operand.""" - gpuary_ufunc = getattr(ufuncs, ufunc) - npy_ufunc = getattr(numpy, ufunc) - - npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), - dtype=dtype, - positive=True) - - try: - res = numpy.finfo(dtype).resolution - except ValueError: - res = numpy.finfo(numpy.promote_types(dtype, - numpy.float16)).resolution - rtol = 10 * res - - try: - npy_result = npy_ufunc(npy_arr, scalar) - except TypeError: - # Make sure we raise the same error as Numpy - with self.assertRaises(TypeError): - gpuary_ufunc(gpuary_arr, scalar) - else: - if npy_result.dtype == numpy.dtype('float16'): - # We use float32 as minimum for GPU arrays, do the same here - npy_result = npy_result.astype('float32') - - gpuary_result = gpuary_ufunc(gpuary_arr, scalar) - assert npy_result.shape == gpuary_result.shape - assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, - atol=rtol, equal_nan=True) + + # Using numpy arrays as input + gpuary_result = gpuary_reduction(npy_arr, axis=axis, keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol) + + +def test_unary_ufuncs(): + for ufunc in UNARY_UFUNCS: + for dtype in dtypes: + yield check_unary_ufunc, ufunc, dtype + + +def check_unary_ufunc(ufunc, dtype): + """Test GpuArray unary ufuncs against equivalent Numpy results.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype, + positive=True) + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + numpy.float16)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr) + except TypeError: + # Make sure we raise the same error as Numpy + with assert_raises(TypeError): + gpuary_ufunc(gpuary_arr) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) + + # In-place + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_ufunc(gpuary_arr, out) + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + equal_nan=True) + + +def test_binary_ufuncs(): + for ufunc in BINARY_UFUNCS: + for dtype1 in dtypes: + for dtype2 in dtypes: + yield check_binary_ufunc, ufunc, dtype1, dtype2 + + scalars = [1, 5] + if ufunc not in ('fmod', 'remainder', 'floor_divide'): + scalars.append(0) + if 
not numpy.issubsctype(dtype1, numpy.unsignedinteger): + scalars += [-1, -2] + + for scalar in scalars: + yield check_binary_ufunc_scalar, ufunc, scalar, dtype1 + + +def check_binary_ufunc(ufunc, dtype1, dtype2): + """Test GpuArray binary ufunc against equivalent Numpy result.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype1, + positive=True) + npy_arr2, gpuary_arr2 = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype2, + positive=True) + + try: + res = numpy.finfo(numpy.result_type(dtype1, dtype2)).resolution + except ValueError: + res = numpy.finfo(numpy.result_type(numpy.float16, + dtype1, dtype2)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr, npy_arr2) + except TypeError: + # Make sure we raise the same error as Numpy + with assert_raises(TypeError): + gpuary_ufunc(gpuary_arr, gpuary_arr2) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr, gpuary_arr2) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) + + # In-place + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_ufunc(gpuary_arr, gpuary_arr2, out) + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + equal_nan=True) + + +def check_binary_ufunc_scalar(ufunc, scalar, dtype): + """Test GpuArray binary ufunc with scalar second operand.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype, + positive=True) + + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + numpy.float16)).resolution + rtol = 10 * res + + try: + npy_result = npy_ufunc(npy_arr, scalar) + except TypeError: + # Make sure we raise the same error as Numpy + with assert_raises(TypeError): + gpuary_ufunc(gpuary_arr, scalar) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(gpuary_arr, scalar) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) From 988adf83bf4dd595ac8f7d8f05ef0986da87e102 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 3 Mar 2017 18:57:24 +0100 Subject: [PATCH 06/29] MAINT: various minor fixes --- pygpu/ufuncs.py | 148 +++++++++++++++++++++++------------------------- 1 file changed, 71 insertions(+), 77 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 01ad9fdf04..d20c5d3511 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -1,18 +1,18 @@ """Ufuncs and reductions for GPU arrays.""" -from itertools import chain import numpy import re -from ._array import ndgpuarray -from .dtypes import dtype_to_ctype -from ._elemwise import arg -from .elemwise import as_argument, GpuElemwise -from .reduction import reduce1 -from .gpuarray import GpuArray, array, empty, get_default_context +from pygpu._array import ndgpuarray +from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE +from pygpu._elemwise import arg +from pygpu.elemwise import as_argument, GpuElemwise 
+from pygpu.reduction import reduce1 +from pygpu.gpuarray import GpuArray, array, empty, get_default_context # --- Helper functions --- # + def restore_reduced_dims(shape, red_axes): """Return tuple from ``shape`` with size-1 axes at indices ``red_axes``.""" newshape = list(shape) @@ -40,19 +40,23 @@ def reduce_dims(shape, red_axes): def _prepare_array_for_reduction(a, out): """Return input array ready for usage in a reduction kernel.""" - # Get a context and an array class to work with + # Get a context and an array class to work with. Use the "highest" + # class present in the inputs. need_context = True + ctx = None + cls = None for ary in (a, out): if isinstance(ary, GpuArray): + if ctx is not None and ary.context != ctx: + raise ValueError('cannot mix contexts') ctx = ary.context - cls = ary.__class__ + if cls is None or cls == GpuArray: + cls = ary.__class__ need_context = False - break + if need_context: ctx = get_default_context() - # TODO: sensible choice? Makes sense to choose the more "feature-rich" - # variant here perhaps. - cls = ndgpuarray + cls = ndgpuarray # TODO: sensible choice as default? # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): @@ -99,18 +103,13 @@ def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, is a scalar. """ a = _prepare_array_for_reduction(a, out) - if a.dtype == numpy.dtype('float16') or dtype == numpy.dtype('float16'): - # Gives wrong results currently, see - # https://github.com/Theano/libgpuarray/issues/316 - raise NotImplementedError('float16 currently broken') if dtype is None: if numpy.issubsctype(a.dtype, numpy.unsignedinteger): - # Avoid overflow for small integer types by default, as in - # Numpy - out_type = numpy.dtype('uint64') + # Avoid overflow for small integer types by default, as in Numpy + out_type = max(a.dtype, numpy.dtype('uint')) elif numpy.issubsctype(a.dtype, numpy.integer): - out_type = numpy.dtype('int64') + out_type = max(a.dtype, numpy.dtype('int')) else: out_type = a.dtype else: @@ -189,10 +188,6 @@ def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): is a scalar. 
""" a = _prepare_array_for_reduction(a, out) - if a.dtype == numpy.dtype('float16'): - # Gives wrong results currently, see - # https://github.com/Theano/libgpuarray/issues/316 - raise NotImplementedError('float16 currently broken') axes = axis if axis is not None else tuple(range(a.ndim)) if out is not None: @@ -223,16 +218,18 @@ def amin(a, axis=None, out=None, keepdims=False): numpy.amin """ a = _prepare_array_for_reduction(a, out) - if numpy.issubsctype(a.dtype, numpy.integer): + if a.dtype == numpy.bool: + neutral = 1 + elif numpy.issubsctype(a.dtype, numpy.integer): neutral = numpy.iinfo(a.dtype).max elif numpy.issubsctype(a.dtype, numpy.floating): - neutral = numpy.finfo(a.dtype).max + neutral = numpy.inf elif numpy.issubsctype(a.dtype, numpy.complexfloating): - raise ValueError("array dtype '{}' not comparable" - "".format(a.dtype.name)) + raise ValueError('array dtype {!r} not comparable' + ''.format(a.dtype.name)) else: - raise ValueError("array dtype '{}' not supported" - "".format(a.dtype.name)) + raise ValueError('array dtype {!r} not supported' + ''.format(a.dtype.name)) return reduce_with_cmp(a, '<', neutral, axis, out, keepdims) @@ -244,22 +241,26 @@ def amax(a, axis=None, out=None, keepdims=False): numpy.amax """ a = _prepare_array_for_reduction(a, out) - if numpy.issubsctype(a.dtype, numpy.integer): + if a.dtype == numpy.bool: + neutral = 0 + elif numpy.issubsctype(a.dtype, numpy.integer): neutral = numpy.iinfo(a.dtype).min elif numpy.issubsctype(a.dtype, numpy.floating): - neutral = numpy.finfo(a.dtype).min + neutral = -numpy.inf elif numpy.issubsctype(a.dtype, numpy.complexfloating): - raise ValueError("array dtype '{}' not comparable" - "".format(a.dtype.name)) + raise ValueError('array dtype {!r} not comparable' + ''.format(a.dtype.name)) else: - raise ValueError("array dtype '{}' not supported" - "".format(a.dtype.name)) + raise ValueError('array dtype {!r} not supported' + ''.format(a.dtype.name)) return reduce_with_cmp(a, '>', neutral, axis, out, keepdims) # --- Elementwise ufuncs --- # +# This dictionary is derived from Numpy's C99_FUNCS list, see +# https://github.com/numpy/numpy/search?q=C99_FUNCS UNARY_C_FUNC_TO_UFUNC = { 'abs': 'absolute', 'acos': 'arccos', @@ -289,20 +290,16 @@ def amax(a, axis=None, out=None, keepdims=False): 'tanh': 'tanh', 'trunc': 'trunc', } - UNARY_UFUNC_TO_C_FUNC = {v: k for k, v in UNARY_C_FUNC_TO_UFUNC.items()} - UNARY_UFUNC_TO_C_OP = { 'bitwise_not': '~', 'logical_not': '!', 'negative': '-', } - UNARY_UFUNCS = (list(UNARY_UFUNC_TO_C_FUNC.keys()) + list(UNARY_UFUNC_TO_C_OP.keys())) UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit', 'square']) - BINARY_C_FUNC_TO_UFUNC = { 'atan2': 'arctan2', 'copysign': 'copysign', @@ -312,9 +309,7 @@ def amax(a, axis=None, out=None, keepdims=False): 'nextafter': 'nextafter', 'fmod': 'fmod', } - BINARY_UFUNC_TO_C_FUNC = {v: k for k, v in BINARY_C_FUNC_TO_UFUNC.items()} - BINARY_UFUNC_TO_C_OP = { 'add': '+', 'bitwise_and': '&', @@ -333,7 +328,6 @@ def amax(a, axis=None, out=None, keepdims=False): 'right_shift': '>>', 'subtract': '-', } - BINARY_UFUNCS = (list(BINARY_UFUNC_TO_C_FUNC.keys()) + list(BINARY_UFUNC_TO_C_OP.keys())) BINARY_UFUNCS.extend(['floor_divide', 'true_divide', 'logical_xor', @@ -392,16 +386,9 @@ def ufunc_dtypes(ufunc_name, dtypes_in): ((dtype('float32'), dtype('float32')), (dtype('float32'),)) """ npy_ufunc = getattr(numpy, ufunc_name) + supported_dtypes = set(NAME_TO_DTYPE.values()) - # TODO: get supported dtypes from some place within libgpuarray? 
- dtypes = [numpy.dtype(dt) - for dt in chain(numpy.sctypes['int'], numpy.sctypes['uint'], - numpy.sctypes['float']) - if numpy.dtype(dt) not in [numpy.dtype('float16'), - numpy.dtype('float128'), - numpy.dtype('complex256')]] - - # Filter for dtypes larger than our input types + # Filter for dtypes larger than our input dtypes, using only supported ones def larger_eq_than_dtypes(sig): from_part = sig.split('->')[0] if len(from_part) != len(dtypes_in): @@ -409,20 +396,22 @@ def larger_eq_than_dtypes(sig): else: dts = tuple(numpy.dtype(c) for c in from_part) # Currently unsupported, filtering out - if any(dt not in dtypes for dt in dts): + if any(dt not in supported_dtypes for dt in dts): return False else: return all(dt >= dt_in for dt, dt_in in zip(dts, dtypes_in)) + # List of ufunc signatures that are "larger" than our input dtypes larger_sig_list = list(filter(larger_eq_than_dtypes, npy_ufunc.types)) if not larger_sig_list: - # TODO: Numpy raises TypeError for bad data types, which is wrong, + # Numpy raises TypeError for bad data types, which is not quite right, # but we mirror that behavior raise TypeError('data types {} not supported for ufunc {}' ''.format(tuple(dt.name for dt in dtypes_in), ufunc_name)) - # Key function for signature comparison + # Key function for signature comparison. It results in comparison of + # *all* typecodes in the signature since they are assembled in a tuple. def from_part_key(sig): from_part = sig.split('->')[0] return tuple(numpy.dtype(c) for c in from_part) @@ -533,19 +522,23 @@ def unary_ufunc(a, ufunc_name, out=None): ufunc_result_dtype : Get dtype of a ufunc result. ufunc_c_funcname : Get C name for math ufuncs. """ - # Get a context and an array class to work with + # Get a context and an array class to work with. Use the "highest" + # class present in the inputs. need_context = True + ctx = None + cls = None for ary in (a, out): if isinstance(ary, GpuArray): + if ctx is not None and ary.context != ctx: + raise ValueError('cannot mix contexts') ctx = ary.context - cls = ary.__class__ + if cls is None or cls == GpuArray: + cls = ary.__class__ need_context = False - break + if need_context: ctx = get_default_context() - # TODO: sensible choice? Makes sense to choose the more "feature-rich" - # variant here perhaps. - cls = ndgpuarray + cls = ndgpuarray # TODO: sensible choice as default? # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): @@ -569,8 +562,8 @@ def unary_ufunc(a, ufunc_name, out=None): # This is the "fallback signature" case, for us it signals failure. # TypeError is what Numpy raises, too, which is kind of wrong if prom_dtype_in == numpy.dtype(object): - raise TypeError("input dtype '{}' invalid for ufunc '{}'" - "".format(a.dtype.name, ufunc_name)) + raise TypeError('input dtype {!r} invalid for ufunc {!r}' + ''.format(a.dtype.name, ufunc_name)) # Convert input such that the kernel runs # TODO: can this be avoided? @@ -581,11 +574,10 @@ def unary_ufunc(a, ufunc_name, out=None): else: # TODO: allow larger dtype if out.dtype != result_dtype: - raise ValueError("`out.dtype` != result dtype ('{}' != '{}')" - "".format(out.dtype.name, result_dtype.name)) + raise ValueError('`out.dtype` != result dtype ({!r} != {!r})' + ''.format(out.dtype.name, result_dtype.name)) # C result dtype for casting - # TODO: necessary? overly cautious? 
c_res_dtype = dtype_to_ctype(result_dtype) oper = '' @@ -623,8 +615,8 @@ def unary_ufunc(a, ufunc_name, out=None): oper = 'res = ({}) (a * a)'.format(c_res_dtype) if not oper: - raise ValueError("`ufunc_name` '{}' does not represent a unary ufunc" - "".format(ufunc_name)) + raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc' + ''.format(ufunc_name)) a_arg = as_argument(a, 'a', read=True) args = [arg('res', out.dtype, write=True), a_arg] @@ -721,8 +713,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): # This is the "fallback signature" case, for us it signals failure if any(dt == numpy.dtype(object) for dt in prom_dtypes_in): - raise TypeError("input dtypes {} invalid for ufunc '{}'" - "".format((a.dtype.name, b.dtype.name), ufunc_name)) + raise TypeError('input dtypes {} invalid for ufunc {!r}' + ''.format((a.dtype.name, b.dtype.name), ufunc_name)) # Convert input such that the kernel runs # TODO: can this be avoided? @@ -744,7 +736,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): raise ValueError('`out.shape` != result shape ({} != {})' ''.format(out.shape, result_shape)) if out.dtype != result_dtype: - raise ValueError("`out.dtype` != result dtype ('{}' != '{}')" + raise ValueError('`out.dtype` != result dtype ({!r} != {!r})' ''.format(out.dtype.name, result_dtype.name)) a_arg = as_argument(a, 'a', read=True) @@ -835,8 +827,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): oper = 'res = ({rt}) {}'.format(comp_part, rt=c_res_dtype) if not oper: - raise ValueError("`ufunc_name` '{}' does not represent a binary " - "ufunc".format(ufunc_name)) + raise ValueError('`ufunc_name` {!r} does not represent a binary ' + 'ufunc'.format(ufunc_name)) kernel = GpuElemwise(a.context, oper, args, convert_f16=True) kernel(out, a, b, broadcast=True) @@ -862,13 +854,16 @@ def binary_ufunc(a, b, ufunc_name, out=None): ('deg2rad', 'degrees'), ('rad2deg', 'radians'), ('true_divide', 'divide'), - ('maximum', 'fmax'), # differ in NaN propagation in numpy, doable? - ('minimum', 'fmin'), # differ in NaN propagation in numpy, doable? + ('maximum', 'fmax'), # TODO: differ in NaN propagation in numpy, doable? + ('minimum', 'fmin'), # TODO: differ in NaN propagation in numpy, doable? ('bitwise_not', 'invert'), ('remainder', 'mod') ] +# --- Add ufuncs to global namespace --- # + + def make_unary_ufunc(name, doc): def wrapper(a, out=None): return unary_ufunc(a, name, out) @@ -922,7 +917,6 @@ def wrapper(a, b, out=None): if __name__ == '__main__': - # pylint: disable=wrong-import-position from doctest import testmod, NORMALIZE_WHITESPACE import numpy optionflags = NORMALIZE_WHITESPACE From 3579188a44d32f44935790c955113254bb971694 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 3 Mar 2017 18:58:13 +0100 Subject: [PATCH 07/29] ENH: add `all` and `any` reductions --- pygpu/ufuncs.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index d20c5d3511..314192ae70 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -151,11 +151,35 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=False): -------- numpy.prod """ + # Do what Numpy does with booleans, sensible or not + if a.dtype == bool and dtype is None: + dtype = int return reduce_with_op(a, '*', 1, axis, dtype, out, keepdims) +def all(a, axis=None, out=None, keepdims=False): + """Test whether all array elements along a given axis evaluate to True. 
+ + See Also + -------- + numpy.all + """ + return reduce_with_op(a, '&&', 1, axis, numpy.bool, out, keepdims) + + +def any(a, axis=None, out=None, keepdims=False): + """Test whether all array elements along a given axis evaluate to True. + + See Also + -------- + numpy.all + """ + return reduce_with_op(a, '||', 0, axis, numpy.bool, out, keepdims) + + # --- Reductions with comparison operators --- # + def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): """Reduce ``a`` by comparison using ``cmp``. @@ -385,6 +409,7 @@ def ufunc_dtypes(ufunc_name, dtypes_in): >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('float32')]) ((dtype('float32'), dtype('float32')), (dtype('float32'),)) """ + from builtins import all, any npy_ufunc = getattr(numpy, ufunc_name) supported_dtypes = set(NAME_TO_DTYPE.values()) @@ -653,6 +678,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): -------- pygpu.gpuarray.set_default_context """ + from builtins import any # Get a context and an array class to work with need_context = True for ary in (a, b, out): From 696751f85dce16244f1d388ae049dcc570909f28 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 3 Mar 2017 18:59:06 +0100 Subject: [PATCH 08/29] BUG: fix ufuncs for bool dtype Numpy does some strange things with bools in ufuncs, we just rely on C behavior. The thus failing tests are filtered out. Other tests for corner cases fail due to non-obvious handling of zero division and upcasting in Numpy, they're filtered out, too. --- pygpu/tests/test_ufuncs.py | 62 +++++++++++++++++++++++++++++++++++--- pygpu/ufuncs.py | 25 ++++++++++++--- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index 69937b5bd5..bc8b453801 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -118,10 +118,17 @@ def check_reduction(reduction, dtype, axis, keepdims): atol=rtol) +fail_unary = [('reciprocal', bool), # Numpy maps to logical_not.. why? + ('sign', bool), # no valid signature + ('fmod', bool) # wrong result where second array is 0 + ] + + def test_unary_ufuncs(): for ufunc in UNARY_UFUNCS: for dtype in dtypes: - yield check_unary_ufunc, ufunc, dtype + if (ufunc, dtype) not in fail_unary: + yield check_unary_ufunc, ufunc, dtype def check_unary_ufunc(ufunc, dtype): @@ -162,20 +169,67 @@ def check_unary_ufunc(ufunc, dtype): equal_nan=True) +sint_dtypes = [dt for dt in NAME_TO_DTYPE.values() + if numpy.issubsctype(dt, numpy.signedinteger)] +uint_dtypes = [dt for dt in NAME_TO_DTYPE.values() + if numpy.issubsctype(dt, numpy.unsignedinteger)] +int_dtypes = sint_dtypes + uint_dtypes +float_dtypes = [dt for dt in NAME_TO_DTYPE.values() + if numpy.issubsctype(dt, numpy.floating)] +# List of failing combinations. +# If a dtype key is not present, it is equivalent to "all dtypes". 
+# Reasons for failure: +# - fmod: division by zero handled differently +# - nextafter: for bool, numpy upcasts to float16 only, we cast to float32 +# - power: negative numbers to the `True`-th power are too large by 1 +# - left_shift: numpy wraps around (True << -3 is huge), our code doesn't +# - floor_divide: division by zero handled differently +# - logical_xor: our code yields True for True ^ 0.1 due to rounding +# - remainder: division by zero handled differently +fail_binary = [{'ufunc': 'fmod', 'dtype2': [bool]}, # wrong where array2 is 0 + {'ufunc': 'nextafter', 'dtype1': [bool], 'dtype2': int_dtypes}, + {'ufunc': 'left_shift', 'dtype1': sint_dtypes}, + {'ufunc': 'left_shift', 'dtype2': sint_dtypes}, + {'ufunc': 'floor_divide', 'dtype2': [bool]}, + {'ufunc': 'logical_xor', 'dtype1': [bool], + 'dtype2': float_dtypes}, + {'ufunc': 'remainder', 'dtype2': [bool]}, + ] + + def test_binary_ufuncs(): for ufunc in BINARY_UFUNCS: for dtype1 in dtypes: for dtype2 in dtypes: - yield check_binary_ufunc, ufunc, dtype1, dtype2 + if any(d['ufunc'] == ufunc and + dtype1 in d.get('dtype1', [dtype1]) and + dtype2 in d.get('dtype2', [dtype2]) + for d in fail_binary): + pass + else: + yield check_binary_ufunc, ufunc, dtype1, dtype2 scalars = [1, 5] if ufunc not in ('fmod', 'remainder', 'floor_divide'): scalars.append(0) - if not numpy.issubsctype(dtype1, numpy.unsignedinteger): + if (numpy.issubsctype(dtype1, numpy.unsignedinteger) or + (ufunc == 'power' and + (numpy.issubsctype(dtype1, numpy.integer) or + dtype1 == bool))): + # Not appending negative scalars in this case. + # power: integers (including bool) to negative integer powers + # not valid + pass + else: scalars += [-1, -2] for scalar in scalars: - yield check_binary_ufunc_scalar, ufunc, scalar, dtype1 + if any(d['ufunc'] == ufunc and + dtype1 in d.get('dtype2', [dtype2]) # this is correct + for d in fail_binary): + pass + else: + yield check_binary_ufunc_scalar, ufunc, scalar, dtype1 def check_binary_ufunc(ufunc, dtype1, dtype2): diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 314192ae70..6791b82798 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -2,6 +2,7 @@ import numpy import re +import warnings from pygpu._array import ndgpuarray from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE from pygpu._elemwise import arg @@ -141,6 +142,9 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=False): -------- numpy.sum """ + # Do what Numpy does with booleans, sensible or not + if a.dtype == bool and dtype is None: + dtype = int return reduce_with_op(a, '+', 0, axis, dtype, out, keepdims) @@ -247,7 +251,7 @@ def amin(a, axis=None, out=None, keepdims=False): elif numpy.issubsctype(a.dtype, numpy.integer): neutral = numpy.iinfo(a.dtype).max elif numpy.issubsctype(a.dtype, numpy.floating): - neutral = numpy.inf + neutral = 'INFINITY' elif numpy.issubsctype(a.dtype, numpy.complexfloating): raise ValueError('array dtype {!r} not comparable' ''.format(a.dtype.name)) @@ -270,7 +274,7 @@ def amax(a, axis=None, out=None, keepdims=False): elif numpy.issubsctype(a.dtype, numpy.integer): neutral = numpy.iinfo(a.dtype).min elif numpy.issubsctype(a.dtype, numpy.floating): - neutral = -numpy.inf + neutral = '-INFINITY' elif numpy.issubsctype(a.dtype, numpy.complexfloating): raise ValueError('array dtype {!r} not comparable' ''.format(a.dtype.name)) @@ -610,11 +614,24 @@ def unary_ufunc(a, ufunc_name, out=None): # Case 1: math function if ufunc_name in UNARY_UFUNC_TO_C_FUNC: c_func = ufunc_c_fname(ufunc_name, (a.dtype,)) - oper = 'res = ({}) 
{}(a)'.format(c_res_dtype, c_func) + # Shortcut for abs() with unsigned int. This also fixes a CUDA quirk + # that makes abs() fail with unsigned ints. + if (ufunc_name == 'absolute' and + numpy.issubsctype(a.dtype, numpy.unsignedinteger)): + out[:] = a.copy() + return out + else: + oper = 'res = ({}) {}(a)'.format(c_res_dtype, c_func) # Case 2: unary operator unop = UNARY_UFUNC_TO_C_OP.get(ufunc_name, None) if unop is not None: + if a.dtype == numpy.bool and unop == '-': + warnings.warn('using negation (`-`) with boolean arrays is ' + 'deprecated, use logical not (`~`) instead; ' + 'the current behavior will be changed along with ' + "NumPy's", FutureWarning) + unop = '!' oper = 'res = ({}) {}a'.format(c_res_dtype, unop) # Other cases: specific functions @@ -627,7 +644,7 @@ def unary_ufunc(a, ufunc_name, out=None): rdt=c_res_dtype) if ufunc_name == 'reciprocal': - oper = 'res = ({}) (1.0) / a'.format(c_res_dtype) + oper = 'res = ({dt}) (({dt}) 1.0) / a'.format(dt=c_res_dtype) if ufunc_name == 'sign': oper = 'res = ({}) ((a > 0) ? 1 : (a < 0) ? -1 : 0)'.format( From 55018f1ed20eb9f2c2cf747cc367785fcd7c3be5 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 17 Apr 2017 00:43:16 +0200 Subject: [PATCH 09/29] BUG: fix power function rounding issue --- pygpu/ufuncs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 6791b82798..7d344a61d4 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -797,9 +797,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): if ufunc_name == 'power': # Arguments to `pow` cannot be integer, need to cast if numpy.issubsctype(result_dtype, numpy.integer): - eps = 0.001 - tpl = 'res = ({rt}) (long) (pow((double) a, (double) b) + {})' - oper = tpl.format(eps, rt=c_res_dtype) + tpl = 'res = ({rt}) (long) round(pow((double) a, (double) b))' + oper = tpl.format(rt=c_res_dtype) else: oper = 'res = ({rt}) pow(({rt}) a, ({rt}) b)'.format( rt=c_res_dtype) @@ -873,7 +872,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): raise ValueError('`ufunc_name` {!r} does not represent a binary ' 'ufunc'.format(ufunc_name)) - kernel = GpuElemwise(a.context, oper, args, convert_f16=True) + kernel = GpuElemwise(a.context, oper, args, convert_f16=True, + preamble='') kernel(out, a, b, broadcast=True) return out From 1dbe9cddf5cd117420f7551696b37ad0387f7672 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 17 Apr 2017 03:08:06 +0200 Subject: [PATCH 10/29] TST: fix or filter out tests of ufunc corner cases --- pygpu/tests/test_ufuncs.py | 144 +++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 38 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index bc8b453801..2b51477c19 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -181,55 +181,92 @@ def check_unary_ufunc(ufunc, dtype): # Reasons for failure: # - fmod: division by zero handled differently # - nextafter: for bool, numpy upcasts to float16 only, we cast to float32 -# - power: negative numbers to the `True`-th power are too large by 1 +# - ldexp: numpy upcasts to float32, we to float64 # - left_shift: numpy wraps around (True << -3 is huge), our code doesn't # - floor_divide: division by zero handled differently # - logical_xor: our code yields True for True ^ 0.1 due to rounding # - remainder: division by zero handled differently -fail_binary = [{'ufunc': 'fmod', 'dtype2': [bool]}, # wrong where array2 is 0 - {'ufunc': 'nextafter', 'dtype1': [bool], 'dtype2': int_dtypes}, - {'ufunc': 
'left_shift', 'dtype1': sint_dtypes}, - {'ufunc': 'left_shift', 'dtype2': sint_dtypes}, - {'ufunc': 'floor_divide', 'dtype2': [bool]}, - {'ufunc': 'logical_xor', 'dtype1': [bool], - 'dtype2': float_dtypes}, - {'ufunc': 'remainder', 'dtype2': [bool]}, - ] +fail_binary = {'fmod': {'dtype2': [bool]}, # wrong where array2 is 0 + 'nextafter': {'dtype1': [bool], 'dtype2': int_dtypes}, + 'ldexp': {'dtype2': [numpy.uint16]}, + 'left_shift': {'dtype2': sint_dtypes}, + 'floor_divide': {'dtype2': [bool]}, + 'logical_xor': {'dtype1': [bool], 'dtype2': float_dtypes}, + 'remainder': {'dtype2': [bool]}, + } +# ufuncs where negative scalars trigger upcasting that differs from Numpy +upcast_wrong_neg = ('copysign', 'ldexp', 'hypot', 'arctan2', 'nextafter') def test_binary_ufuncs(): for ufunc in BINARY_UFUNCS: for dtype1 in dtypes: for dtype2 in dtypes: - if any(d['ufunc'] == ufunc and - dtype1 in d.get('dtype1', [dtype1]) and - dtype2 in d.get('dtype2', [dtype2]) - for d in fail_binary): + if (ufunc in fail_binary and + dtype1 in fail_binary[ufunc].get('dtype1', [dtype1]) and + dtype2 in fail_binary[ufunc].get('dtype2', [dtype2])): pass else: yield check_binary_ufunc, ufunc, dtype1, dtype2 - scalars = [1, 5] - if ufunc not in ('fmod', 'remainder', 'floor_divide'): - scalars.append(0) - if (numpy.issubsctype(dtype1, numpy.unsignedinteger) or - (ufunc == 'power' and - (numpy.issubsctype(dtype1, numpy.integer) or - dtype1 == bool))): - # Not appending negative scalars in this case. - # power: integers (including bool) to negative integer powers - # not valid - pass - else: - scalars += [-1, -2] - - for scalar in scalars: - if any(d['ufunc'] == ufunc and - dtype1 in d.get('dtype2', [dtype2]) # this is correct - for d in fail_binary): - pass - else: - yield check_binary_ufunc_scalar, ufunc, scalar, dtype1 + scalars_left = [-2, -1, -0.5, 0, 1, 2.5, 4] + scalars_right = list(scalars_left) + + # For scalars, we need to exclude some cases + + # Obvious things first: no invalid scalars for the given dtype + if numpy.issubsctype(dtype1, numpy.integer): + scalars_left = [x for x in scalars_left if int(x) == x] + if numpy.issubsctype(dtype2, numpy.integer): + scalars_right = [x for x in scalars_right if int(x) == x] + if numpy.issubsctype(dtype1, numpy.unsignedinteger): + scalars_left = [x for x in scalars_left if x >= 0] + if numpy.issubsctype(dtype2, numpy.unsignedinteger): + scalars_right = [x for x in scalars_right if x >= 0] + + # Special treatment for some ufuncs + if ufunc == 'power': + if (numpy.issubsctype(dtype1, numpy.integer) or + dtype1 == bool): + # Negative integer power of integer is invalid + scalars_right = [x for x in scalars_right if x >= 0] + if numpy.issubsctype(dtype2, numpy.signedinteger): + # Negative integer power of integer is invalid + scalars_left = [x for x in scalars_right if x >= 0] + if ufunc in ('fmod', 'remainder', 'floor_divide'): + # These ufuncs divide by the right operand, so we remove 0 + scalars_left = [x for x in scalars_left if x >= 0] + if ufunc == 'copysign': + if numpy.issubsctype(dtype2, numpy.unsignedinteger): + # Don't try to copy negative sign to unsigned array + scalars_left = [x for x in scalars_left if x >= 0] + if ufunc == 'ldexp': + # Only integers as second argument + scalars_right = [x for x in scalars_right if int(x) == x] + if numpy.issubsctype(dtype2, numpy.unsignedinteger): + # This is not allowed due to invalid type coercion + scalars_left = [] + + # Remove negative scalars for ufuncs with differing upcasting + if ufunc in upcast_wrong_neg: + scalars_left = 
[x for x in scalars_left if x >= 0] + scalars_right = [x for x in scalars_left if x >= 0] + + for scalar in scalars_left: + if (ufunc in fail_binary and + fail_binary[ufunc].get('dtype2', [dtype2])): + pass + else: + yield (check_binary_ufunc_scalar_left, + ufunc, scalar, dtype2) + + for scalar in scalars_right: + if (ufunc in fail_binary and + fail_binary[ufunc].get('dtype1', [dtype1])): + pass + else: + yield (check_binary_ufunc_scalar_right, + ufunc, scalar, dtype1) def check_binary_ufunc(ufunc, dtype1, dtype2): @@ -243,7 +280,6 @@ def check_binary_ufunc(ufunc, dtype1, dtype2): npy_arr2, gpuary_arr2 = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype2, positive=True) - try: res = numpy.finfo(numpy.result_type(dtype1, dtype2)).resolution except ValueError: @@ -275,15 +311,47 @@ def check_binary_ufunc(ufunc, dtype1, dtype2): equal_nan=True) -def check_binary_ufunc_scalar(ufunc, scalar, dtype): - """Test GpuArray binary ufunc with scalar second operand.""" +def check_binary_ufunc_scalar_left(ufunc, scalar, dtype): + """Test GpuArray binary ufunc with scalar first operand.""" gpuary_ufunc = getattr(ufuncs, ufunc) npy_ufunc = getattr(numpy, ufunc) npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype, positive=True) + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + numpy.float16)).resolution + rtol = 10 * res + try: + npy_result = npy_ufunc(scalar, npy_arr) + except TypeError: + # Make sure we raise the same error as Numpy + with assert_raises(TypeError): + gpuary_ufunc(gpuary_arr, scalar) + else: + if npy_result.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result = npy_result.astype('float32') + + gpuary_result = gpuary_ufunc(scalar, gpuary_arr) + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + atol=rtol, equal_nan=True) + + +def check_binary_ufunc_scalar_right(ufunc, scalar, dtype): + """Test GpuArray binary ufunc with scalar second operand.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), + dtype=dtype, + positive=True) try: res = numpy.finfo(dtype).resolution except ValueError: From be936946d7af9420dd9e913b78caf58be7e84078 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 17 Apr 2017 01:44:51 +0200 Subject: [PATCH 11/29] ENH: use preamble for more complex ufuncs --- pygpu/ufuncs.py | 51 +++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 7d344a61d4..c9b3929738 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -1,5 +1,6 @@ """Ufuncs and reductions for GPU arrays.""" +import mako import numpy import re import warnings @@ -789,8 +790,9 @@ def binary_ufunc(a, b, ufunc_name, out=None): # C result dtype for casting c_res_dtype = dtype_to_ctype(result_dtype) - # Set string for mapping operation + # Set string for mapping operation and preamble for additional functions oper = '' + preamble = '' # Case 1: math function if ufunc_name in BINARY_UFUNC_TO_C_FUNC: @@ -825,16 +827,19 @@ def binary_ufunc(a, b, ufunc_name, out=None): if ufunc_name == 'floor_divide': # implement as sign(a/b) * int(abs(a/b) + shift(a,b)) # where shift(a,b) = 0 if sign(a) == sign(b) else 1 - epsilon - sign_part = '(((a < 0) != (b < 0)) ? 
-1 : 1)' - # TODO: this computes twice - better solution? - abs_div_part = ('(((double)a / b < 0) ? ' + - '-(double)a / b : ' + - '(double)a / b)') - shift_part = '(((a < 0) != (b < 0)) ? 0.999 : 0)' - oper = 'res = ({}) ({} * (long)({} + {}))'.format(c_res_dtype, - sign_part, - abs_div_part, - shift_part) + preamble = ''' + WITHIN_KERNEL long + floor_div_dbl(double a, double b) { + double quot = a / b; + if ((a < 0) != (b < 0)) { + return - (long) (quot + 0.999); + } else { + return (long) quot; + } + } + ''' + oper = 'res = ({}) floor_div_dbl((double) a, (double) b)'.format( + c_res_dtype) if ufunc_name == 'true_divide': if result_dtype == numpy.dtype('float64'): @@ -855,25 +860,33 @@ def binary_ufunc(a, b, ufunc_name, out=None): elif ufunc_name == 'remainder': # The same as `fmod` except for b < 0, where we have - # remainder(a,b) = fmod(a, b) + # remainder(a, b) = fmod(a, b) + b if numpy.issubsctype(result_dtype, numpy.integer): cast_type = 'double' else: cast_type = c_res_dtype - # TODO: this computes fmod(a,b) twice for b < 0, better solution? - mod_expr = 'fmod(({ct}) a, ({ct}) b)'.format(ct=cast_type) - comp_part = ('(((b < 0) && ({expr} != 0)) ? ' + - '({ct})b + {expr} : ' + - '{expr})').format(ct=cast_type, expr=mod_expr) - oper = 'res = ({rt}) {}'.format(comp_part, rt=c_res_dtype) + preamble = mako.template.Template(''' + WITHIN_KERNEL ${ct} + rem(${ct} a, ${ct} b) { + ${ct} modval = fmod(a, b); + if (b < 0 && modval != 0) { + return b + modval; + } else { + return modval; + } + } + ''').render(ct=cast_type) + oper = 'res = ({rt}) rem(({ct}) a, ({ct}) b)'.format( + rt=c_res_dtype, ct=cast_type) + print(oper) if not oper: raise ValueError('`ufunc_name` {!r} does not represent a binary ' 'ufunc'.format(ufunc_name)) kernel = GpuElemwise(a.context, oper, args, convert_f16=True, - preamble='') + preamble=preamble) kernel(out, a, b, broadcast=True) return out From a39d64aa799df4cee33ffc4773ba1f85b1695706 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 17 Apr 2017 03:04:19 +0200 Subject: [PATCH 12/29] ENH: add some more ufuncs --- pygpu/ufuncs.py | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index c9b3929738..5dedc0e935 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -328,7 +328,7 @@ def amax(a, axis=None, out=None, keepdims=False): UNARY_UFUNCS = (list(UNARY_UFUNC_TO_C_FUNC.keys()) + list(UNARY_UFUNC_TO_C_OP.keys())) UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit', - 'square']) + 'square', 'isinf', 'isfinite', 'spacing']) BINARY_C_FUNC_TO_UFUNC = { 'atan2': 'arctan2', 'copysign': 'copysign', @@ -360,7 +360,8 @@ def amax(a, axis=None, out=None, keepdims=False): BINARY_UFUNCS = (list(BINARY_UFUNC_TO_C_FUNC.keys()) + list(BINARY_UFUNC_TO_C_OP.keys())) BINARY_UFUNCS.extend(['floor_divide', 'true_divide', 'logical_xor', - 'maximum', 'minimum', 'remainder']) + 'maximum', 'minimum', 'remainder', 'logaddexp', + 'logaddexp2']) def ufunc_dtypes(ufunc_name, dtypes_in): @@ -657,6 +658,27 @@ def unary_ufunc(a, ufunc_name, out=None): if ufunc_name == 'square': oper = 'res = ({}) (a * a)'.format(c_res_dtype) + if ufunc_name == 'isfinite': + # TODO: NaN part yields wrong value + oper = ''' + res = ({}) (a != INFINITY && a != -INFINITY && a != NAN) + '''.format(c_res_dtype) + + if ufunc_name == 'isinf': + oper = 'res = ({}) (a == INFINITY || a == -INFINITY)'.format( + c_res_dtype) + + if ufunc_name == 'isnan': + # TODO: always returns False, fix NaN comparison in C 
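The TODO above is rooted in IEEE-754 semantics: NaN compares unequal to every value, including itself, so a test of the form ``a == NAN`` can never be true; the dedicated ``isnan`` check, which a later patch in this series switches to, is the reliable way to detect NaN. The same behavior is easy to verify on the host side (an illustrative aside, not part of the patch)::

    >>> import math
    >>> nan = float('nan')
    >>> nan == nan
    False
    >>> math.isnan(nan)
    True
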
+ oper = 'res = ({}) (a == NAN)'.format(c_res_dtype) + + if ufunc_name == 'spacing': + oper = ''' + res = ({}) ((a < 0) ? + nextafter(a, -INFINITY) - a : + nextafter(a, INFINITY) - a) + '''.format(c_res_dtype) + if not oper: raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc' ''.format(ufunc_name)) @@ -879,7 +901,14 @@ def binary_ufunc(a, b, ufunc_name, out=None): ''').render(ct=cast_type) oper = 'res = ({rt}) rem(({ct}) a, ({ct}) b)'.format( rt=c_res_dtype, ct=cast_type) - print(oper) + + elif ufunc_name == 'logaddexp': + oper = 'res = ({}) log(exp(a) + exp(b))'.format(c_res_dtype) + + elif ufunc_name == 'logaddexp2': + oper = ''' + res = ({}) log(exp(a * log(2.0)) + exp(b * log(2.0))) / log(2.0) + '''.format(c_res_dtype) if not oper: raise ValueError('`ufunc_name` {!r} does not represent a binary ' @@ -894,13 +923,9 @@ def binary_ufunc(a, b, ufunc_name, out=None): MISSING_UFUNCS = [ 'conjugate', # no complex dtype yet 'frexp', # multiple output values, how to handle that? - 'isfinite', # how to test in C? - 'isinf', # how to test in C? - 'isnan', # how to test in C? - 'logaddexp', # not a one-liner (at least not in numpy) - 'logaddexp2', # not a one-liner (at least not in numpy) + 'isfinite', # check in C against NaN doesn't work properly + 'isnan', # check in C against NaN doesn't work properly 'modf', # multiple output values, how to handle that? - 'spacing', # implementation? ] From 7d6fe8416af8304a7242df06c7b17aad61a9994c Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Tue, 18 Apr 2017 09:59:03 +0200 Subject: [PATCH 13/29] ENH: implement two-out ufuncs and fix spacing --- pygpu/tests/test_ufuncs.py | 103 +++++++++++++++++----- pygpu/ufuncs.py | 173 ++++++++++++++++++++++++++++++++++--- 2 files changed, 242 insertions(+), 34 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index 2b51477c19..4c190dd07a 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -4,7 +4,7 @@ import pygpu from pygpu.dtypes import NAME_TO_DTYPE from pygpu import ufuncs -from pygpu.ufuncs import UNARY_UFUNCS, BINARY_UFUNCS +from pygpu.ufuncs import UNARY_UFUNCS, UNARY_UFUNCS_TWO_OUT, BINARY_UFUNCS from pygpu.tests.support import context @@ -86,14 +86,14 @@ def check_reduction(reduction, dtype, axis, keepdims): else: assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol) # With out array out = pygpu.empty(npy_result.shape, npy_result.dtype) gpuary_result = gpuary_reduction(gpuary_arr, axis=axis, out=out, keepdims=keepdims) - assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) + assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol) assert out is gpuary_result # Explicit out dtype, supported by some reductions only @@ -103,7 +103,7 @@ def check_reduction(reduction, dtype, axis, keepdims): dtype=out_dtype, keepdims=keepdims) assert gpuary_result.dtype == out_dtype - assert numpy.allclose(npy_result, out, rtol=rtol, atol=rtol) + assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol) # Using numpy arrays as input gpuary_result = gpuary_reduction(npy_arr, axis=axis, keepdims=keepdims) @@ -114,20 +114,27 @@ def check_reduction(reduction, dtype, axis, keepdims): else: assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert 
numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol) -fail_unary = [('reciprocal', bool), # Numpy maps to logical_not.. why? - ('sign', bool), # no valid signature - ('fmod', bool) # wrong result where second array is 0 - ] +# Dictionary of failing combinations. +# Reasons for failure: +# - reciprocal: Numpy ufunc is equivalent to logical_not for bool, no idea why +# - sign: fails for Numpy, too, since there's no valid signature +# - fmod: division by 0 handled differently +# - spacing: Numpy upcasts to float16 for small int types, we can't do that yet +fail_unary = {'reciprocal': [bool], + 'sign': [bool], + 'fmod': [bool], + 'spacing': [bool, numpy.int8, numpy.uint8], + } def test_unary_ufuncs(): for ufunc in UNARY_UFUNCS: for dtype in dtypes: - if (ufunc, dtype) not in fail_unary: + if dtype not in fail_unary.get(ufunc, []): yield check_unary_ufunc, ufunc, dtype @@ -159,16 +166,68 @@ def check_unary_ufunc(ufunc, dtype): gpuary_result = gpuary_ufunc(gpuary_arr) assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol, equal_nan=True) # In-place out = pygpu.empty(npy_result.shape, npy_result.dtype) gpuary_ufunc(gpuary_arr, out) - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(out, npy_result, rtol=rtol, equal_nan=True) +def test_unary_ufuncs_two_out(): + for ufunc in UNARY_UFUNCS_TWO_OUT: + for dtype in dtypes: + if dtype not in fail_unary.get(ufunc, []): + yield check_unary_ufunc_two_out, ufunc, dtype + + +def check_unary_ufunc_two_out(ufunc, dtype): + """Test GpuArray unary ufuncs with two outputs against Numpy.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3), dtype=dtype, + positive=True) + try: + res = numpy.finfo(dtype).resolution + except ValueError: + res = numpy.finfo(numpy.promote_types(dtype, + numpy.float16)).resolution + rtol = 10 * res + + try: + npy_result1, npy_result2 = npy_ufunc(npy_arr) + except TypeError: + # Make sure we raise the same error as Numpy + with assert_raises(TypeError): + gpuary_ufunc(gpuary_arr) + else: + if npy_result1.dtype == numpy.dtype('float16'): + # We use float32 as minimum for GPU arrays, do the same here + npy_result1 = npy_result1.astype('float32') + if npy_result2.dtype == numpy.dtype('float16'): + npy_result2 = npy_result2.astype('float32') + + gpuary_result1, gpuary_result2 = gpuary_ufunc(gpuary_arr) + assert npy_result1.shape == gpuary_result1.shape + assert npy_result2.shape == gpuary_result2.shape + assert npy_result1.dtype == gpuary_result1.dtype + assert npy_result2.dtype == gpuary_result2.dtype + assert numpy.allclose(gpuary_result1, npy_result1, rtol=rtol, + atol=rtol, equal_nan=True) + assert numpy.allclose(gpuary_result2, npy_result2, rtol=rtol, + atol=rtol, equal_nan=True) + + # In-place + out1 = pygpu.empty(npy_result1.shape, npy_result1.dtype) + out2 = pygpu.empty(npy_result2.shape, npy_result2.dtype) + gpuary_ufunc(gpuary_arr, out1, out2) + assert numpy.allclose(out1, npy_result1, rtol=rtol, equal_nan=True) + assert numpy.allclose(out2, npy_result2, rtol=rtol, equal_nan=True) + + sint_dtypes = [dt for dt in NAME_TO_DTYPE.values() if numpy.issubsctype(dt, numpy.signedinteger)] uint_dtypes = [dt for dt in NAME_TO_DTYPE.values() @@ -176,16 +235,18 @@ def check_unary_ufunc(ufunc, dtype): int_dtypes = sint_dtypes + uint_dtypes 
float_dtypes = [dt for dt in NAME_TO_DTYPE.values() if numpy.issubsctype(dt, numpy.floating)] -# List of failing combinations. +# Dictionary of failing combinations. # If a dtype key is not present, it is equivalent to "all dtypes". # Reasons for failure: # - fmod: division by zero handled differently -# - nextafter: for bool, numpy upcasts to float16 only, we cast to float32 -# - ldexp: numpy upcasts to float32, we to float64 -# - left_shift: numpy wraps around (True << -3 is huge), our code doesn't +# - nextafter: for bool, Numpy upcasts to float16 only, we cast to float32 +# - ldexp: Numpy upcasts to float32, we to float64 +# - left_shift: Numpy wraps around (True << -3 is huge), our code doesn't # - floor_divide: division by zero handled differently # - logical_xor: our code yields True for True ^ 0.1 due to rounding # - remainder: division by zero handled differently +# - logaddexp: Numpy upcasts to float32, we to float64 +# - logaddexp2: Numpy upcasts to float32, we to float64 fail_binary = {'fmod': {'dtype2': [bool]}, # wrong where array2 is 0 'nextafter': {'dtype1': [bool], 'dtype2': int_dtypes}, 'ldexp': {'dtype2': [numpy.uint16]}, @@ -193,6 +254,8 @@ def check_unary_ufunc(ufunc, dtype): 'floor_divide': {'dtype2': [bool]}, 'logical_xor': {'dtype1': [bool], 'dtype2': float_dtypes}, 'remainder': {'dtype2': [bool]}, + 'logaddexp': {'dtype2': [numpy.uint16]}, + 'logaddexp2': {'dtype2': [numpy.uint16]}, } # ufuncs where negative scalars trigger upcasting that differs from Numpy upcast_wrong_neg = ('copysign', 'ldexp', 'hypot', 'arctan2', 'nextafter') @@ -301,13 +364,13 @@ def check_binary_ufunc(ufunc, dtype1, dtype2): gpuary_result = gpuary_ufunc(gpuary_arr, gpuary_arr2) assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol, equal_nan=True) # In-place out = pygpu.empty(npy_result.shape, npy_result.dtype) gpuary_ufunc(gpuary_arr, gpuary_arr2, out) - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(out, npy_result, rtol=rtol, equal_nan=True) @@ -340,7 +403,7 @@ def check_binary_ufunc_scalar_left(ufunc, scalar, dtype): gpuary_result = gpuary_ufunc(scalar, gpuary_arr) assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol, equal_nan=True) @@ -373,5 +436,5 @@ def check_binary_ufunc_scalar_right(ufunc, scalar, dtype): gpuary_result = gpuary_ufunc(gpuary_arr, scalar) assert npy_result.shape == gpuary_result.shape assert npy_result.dtype == gpuary_result.dtype - assert numpy.allclose(npy_result, gpuary_result, rtol=rtol, + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol, equal_nan=True) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 5dedc0e935..adc9f5a2bc 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -329,6 +329,7 @@ def amax(a, axis=None, out=None, keepdims=False): list(UNARY_UFUNC_TO_C_OP.keys())) UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit', 'square', 'isinf', 'isfinite', 'spacing']) +UNARY_UFUNCS_TWO_OUT = ['frexp', 'modf'] BINARY_C_FUNC_TO_UFUNC = { 'atan2': 'arctan2', 'copysign': 'copysign', @@ -641,43 +642,49 @@ def unary_ufunc(a, ufunc_name, out=None): oper = 'res = ({rdt})({}) * ({rdt}) a'.format(numpy.deg2rad(1), rdt=c_res_dtype) - if 
ufunc_name == 'rad2deg': + elif ufunc_name == 'rad2deg': oper = 'res = ({rdt})({}) * ({rdt}) a'.format(numpy.rad2deg(1), rdt=c_res_dtype) - if ufunc_name == 'reciprocal': + elif ufunc_name == 'reciprocal': oper = 'res = ({dt}) (({dt}) 1.0) / a'.format(dt=c_res_dtype) - if ufunc_name == 'sign': + elif ufunc_name == 'sign': oper = 'res = ({}) ((a > 0) ? 1 : (a < 0) ? -1 : 0)'.format( c_res_dtype) - if ufunc_name == 'signbit': + elif ufunc_name == 'signbit': oper = 'res = ({}) (a < 0)'.format(c_res_dtype) - if ufunc_name == 'square': + elif ufunc_name == 'square': oper = 'res = ({}) (a * a)'.format(c_res_dtype) - if ufunc_name == 'isfinite': + elif ufunc_name == 'isfinite': # TODO: NaN part yields wrong value oper = ''' res = ({}) (a != INFINITY && a != -INFINITY && a != NAN) '''.format(c_res_dtype) - if ufunc_name == 'isinf': + elif ufunc_name == 'isinf': oper = 'res = ({}) (a == INFINITY || a == -INFINITY)'.format( c_res_dtype) - if ufunc_name == 'isnan': + elif ufunc_name == 'isnan': # TODO: always returns False, fix NaN comparison in C oper = 'res = ({}) (a == NAN)'.format(c_res_dtype) - if ufunc_name == 'spacing': + elif ufunc_name == 'spacing': + if numpy.issubsctype(a.dtype, numpy.integer): + # TODO: float16 as soon as it is properly supported + cast_dtype = numpy.result_type(a.dtype, numpy.float32) + c_cast_dtype = dtype_to_ctype(cast_dtype) + else: + c_cast_dtype = dtype_to_ctype(a.dtype) oper = ''' res = ({}) ((a < 0) ? - nextafter(a, -INFINITY) - a : - nextafter(a, INFINITY) - a) - '''.format(c_res_dtype) + nextafter(({ct}) a, ({ct}) a - 1) - a : + nextafter(({ct}) a, ({ct}) a + 1) - a) + '''.format(c_res_dtype, ct=c_cast_dtype) if not oper: raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc' @@ -920,12 +927,126 @@ def binary_ufunc(a, b, ufunc_name, out=None): return out +def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): + """Call a unary ufunc with two outputs on an array ``a``. + + Parameters + ---------- + a : `array-like` + Input array. + ufunc_name : str + Name of the NumPy ufunc to be called on ``a``. + out1, out2 : `pygpu.gpuarray.GpuArray`, optional + Arrays in which to store the result. Their shape must be equal to + ``a.shape`` and their dtype must be the result dtypes of the + called function. + The arrays ``out1`` and ``out2`` must either both be provided, + or none of them. If both are ``None`` and ``a`` is not a + GPU array, a default GPU context must have been set. + + Returns + ------- + out1, out2 : `pygpu.gpuarray.GpuArray` + Results of the computation. If ``out1``, ``out2`` were given, + the returned object is a reference to it. + If ``out1`` and ``out2`` are ``None`` and ``a`` is not a GPU array, + the result arrays ``out1`` and ``out2`` are instances of + `pygpu._array.ndgpuarray`. + + See Also + -------- + ufunc_result_dtype : Get dtype of a ufunc result. + ufunc_c_funcname : Get C name for math ufuncs. + """ + # Get a context and an array class to work with. Use the "highest" + # class present in the inputs. + need_context = True + ctx = None + cls = None + for ary in (a, out1, out2): + if isinstance(ary, GpuArray): + if ctx is not None and ary.context != ctx: + raise ValueError('cannot mix contexts') + ctx = ary.context + if cls is None or cls == GpuArray: + cls = ary.__class__ + need_context = False + + if need_context: + ctx = get_default_context() + cls = ndgpuarray # TODO: sensible choice as default? + + # TODO: can CPU memory handed directly to kernels? 
+ if not isinstance(a, GpuArray): + a = numpy.asarray(a) + if a.flags.f_contiguous and not a.flags.c_contiguous: + order = 'F' + else: + order = 'C' + a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + cls=cls) + + if a.dtype == numpy.dtype('float16'): + # Gives wrong results currently, see + # https://github.com/Theano/libgpuarray/issues/316 + raise NotImplementedError('float16 currently broken') + + prom_dtypes_in, result_dtypes = ufunc_dtypes(ufunc_name, [a.dtype]) + prom_dtype_in = prom_dtypes_in[0] + result_dtype1, result_dtype2 = result_dtypes + + # This is the "fallback signature" case, for us it signals failure. + # TypeError is what Numpy raises, too, which is kind of wrong + if prom_dtype_in == numpy.dtype(object): + raise TypeError('input dtype {!r} invalid for ufunc {!r}' + ''.format(a.dtype.name, ufunc_name)) + + # Convert input such that the kernel runs + # TODO: can this be avoided? + a = a.astype(prom_dtype_in, copy=False) + + if out1 is None: + out1 = empty(a.shape, dtype=result_dtype1, context=ctx, cls=cls) + else: + # TODO: allow larger dtype + if out1.dtype != result_dtype1: + raise ValueError('`out1.dtype` != result dtype: {!r} != {!r}' + ''.format(out1.dtype.name, result_dtype1.name)) + if out2 is None: + out2 = empty(a.shape, dtype=result_dtype2, context=ctx, cls=cls) + else: + if out2.dtype != result_dtype2: + raise ValueError('`out2.dtype` != result dtype: {!r} != {!r}' + ''.format(out2.dtype.name, result_dtype2.name)) + + # C result dtypes for casting + c_res_dtype1 = dtype_to_ctype(result_dtype1) + + oper = '' + + if ufunc_name == 'frexp': + oper = 'res = ({rdt1}) frexp(a, &out2)'.format(rdt1=c_res_dtype1) + elif ufunc_name == 'modf': + oper = 'res = ({rdt1}) modf(a, &out2)'.format(rdt1=c_res_dtype1) + + if not oper: + raise ValueError('`ufunc_name` {!r} does not represent a unary ufunc' + ''.format(ufunc_name)) + + a_arg = as_argument(a, 'a', read=True) + args = [arg('res', out1.dtype, write=True), + arg('out2', out2.dtype, write=True), + a_arg] + + ker = GpuElemwise(ctx, oper, args, convert_f16=True) + ker(out1, out2, a) + return out1, out2 + + MISSING_UFUNCS = [ 'conjugate', # no complex dtype yet - 'frexp', # multiple output values, how to handle that? 'isfinite', # check in C against NaN doesn't work properly 'isnan', # check in C against NaN doesn't work properly - 'modf', # multiple output values, how to handle that? 
] @@ -969,6 +1090,30 @@ def wrapper(a, out=None): globals()[ufunc_name] = make_unary_ufunc(ufunc_name, doc) +def make_unary_ufunc_two_out(name, doc): + def wrapper(a, out1=None, out2=None): + return unary_ufunc_two_out(a, name, out1, out2) + wrapper.__qualname__ = wrapper.__name__ = name + wrapper.__doc__ = doc + return wrapper + + +# Add the ufuncs to the module dictionary +for ufunc_name in UNARY_UFUNCS_TWO_OUT: + npy_ufunc = getattr(numpy, ufunc_name) + descr = npy_ufunc.__doc__.splitlines()[2] + # Numpy occasionally uses single ticks for doc, we only use them for links + descr = re.sub('`+', '``', descr) + doc = descr + """ + +See Also +-------- +numpy.{} +""".format(ufunc_name) + + globals()[ufunc_name] = make_unary_ufunc_two_out(ufunc_name, doc) + + def make_binary_ufunc(name, doc): def wrapper(a, b, out=None): return binary_ufunc(a, b, name, out) From 994eaebd13a13d052d1fb5f987ab857652c97e06 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Wed, 19 Apr 2017 16:59:22 +0200 Subject: [PATCH 14/29] BUG: fix NaN comparison in ufuncs --- pygpu/ufuncs.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index adc9f5a2bc..c3b58b4d94 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -328,7 +328,7 @@ def amax(a, axis=None, out=None, keepdims=False): UNARY_UFUNCS = (list(UNARY_UFUNC_TO_C_FUNC.keys()) + list(UNARY_UFUNC_TO_C_OP.keys())) UNARY_UFUNCS.extend(['deg2rad', 'rad2deg', 'reciprocal', 'sign', 'signbit', - 'square', 'isinf', 'isfinite', 'spacing']) + 'square', 'isinf', 'isnan', 'isfinite', 'spacing']) UNARY_UFUNCS_TWO_OUT = ['frexp', 'modf'] BINARY_C_FUNC_TO_UFUNC = { 'atan2': 'arctan2', @@ -660,9 +660,8 @@ def unary_ufunc(a, ufunc_name, out=None): oper = 'res = ({}) (a * a)'.format(c_res_dtype) elif ufunc_name == 'isfinite': - # TODO: NaN part yields wrong value oper = ''' - res = ({}) (a != INFINITY && a != -INFINITY && a != NAN) + res = ({}) (a != INFINITY && a != -INFINITY && !isnan(a)) '''.format(c_res_dtype) elif ufunc_name == 'isinf': @@ -670,8 +669,7 @@ def unary_ufunc(a, ufunc_name, out=None): c_res_dtype) elif ufunc_name == 'isnan': - # TODO: always returns False, fix NaN comparison in C - oper = 'res = ({}) (a == NAN)'.format(c_res_dtype) + oper = 'res = ({}) (abs(isnan(a)))'.format(c_res_dtype) elif ufunc_name == 'spacing': if numpy.issubsctype(a.dtype, numpy.integer): @@ -1045,8 +1043,6 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): MISSING_UFUNCS = [ 'conjugate', # no complex dtype yet - 'isfinite', # check in C against NaN doesn't work properly - 'isnan', # check in C against NaN doesn't work properly ] From fa04da359bc8d00ed0f0e9ccc5eeb9f59e758042 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Fri, 2 Jun 2017 14:32:41 +0200 Subject: [PATCH 15/29] BUG: fix binary ufunc for non-array second op --- pygpu/ufuncs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index c3b58b4d94..e9066cfa39 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -765,9 +765,13 @@ def binary_ufunc(a, b, ufunc_name, out=None): else: b = numpy.asarray(b, dtype=numpy.result_type(a, b)) - if b.flags.f_contiguous and not b.flags.c_contiguous: - order = 'F' + if isinstance(b, numpy.ndarray): + if b.flags.f_contiguous and not b.flags.c_contiguous: + order = 'F' + else: + order = 'C' else: + b = numpy.asarray(b) order = 'C' b = array(b, dtype=b.dtype, copy=False, order=order, context=ctx, cls=cls) From 61b75cf2e48e01c61e3934404afd979554d2a2f1 Mon Sep 17 00:00:00 
2001 From: Holger Kohr Date: Mon, 5 Jun 2017 02:19:35 +0200 Subject: [PATCH 16/29] ENH: make ufuncs classes --- pygpu/ufuncs.py | 249 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 215 insertions(+), 34 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index e9066cfa39..a47e0167f3 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -340,24 +340,29 @@ def amax(a, axis=None, out=None, keepdims=False): 'fmod': 'fmod', } BINARY_UFUNC_TO_C_FUNC = {v: k for k, v in BINARY_C_FUNC_TO_UFUNC.items()} -BINARY_UFUNC_TO_C_OP = { +BINARY_UFUNC_TO_C_BINOP = { 'add': '+', 'bitwise_and': '&', 'bitwise_or': '|', 'bitwise_xor': '^', - 'equal': '==', - 'greater': '>', - 'greater_equal': '>=', 'left_shift': '<<', - 'less': '<', - 'less_equal': '<=', 'logical_and': '&&', 'logical_or': '||', 'multiply': '*', - 'not_equal': '!=', 'right_shift': '>>', 'subtract': '-', } +BINARY_UFUNC_TO_C_CMP = { + 'equal': '==', + 'greater': '>', + 'greater_equal': '>=', + 'less': '<', + 'less_equal': '<=', + 'not_equal': '!=', + } +BINARY_UFUNC_TO_C_OP = {} +BINARY_UFUNC_TO_C_OP.update(BINARY_UFUNC_TO_C_BINOP) +BINARY_UFUNC_TO_C_OP.update(BINARY_UFUNC_TO_C_CMP) BINARY_UFUNCS = (list(BINARY_UFUNC_TO_C_FUNC.keys()) + list(BINARY_UFUNC_TO_C_OP.keys())) BINARY_UFUNCS.extend(['floor_divide', 'true_divide', 'logical_xor', @@ -1063,6 +1068,191 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): ] +# --- Reductions from binary ufuncs --- # + + +def make_binary_ufunc_reduce(name): + + npy_ufunc = getattr(numpy, name) + + binop = BINARY_UFUNC_TO_C_BINOP.get(name, None) + if binop is not None: + + def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): + return reduce_with_op(a, binop, npy_ufunc.identity, axis, dtype, + out, keepdims) + + return reduce_wrapper + + cmp = BINARY_UFUNC_TO_C_CMP.get(name, None) + if cmp is not None: + + def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): + return reduce_with_cmp(a, cmp, npy_ufunc.identity, axis, dtype, + out, keepdims) + + return reduce_wrapper + + # TODO: add reduction with binary function, not possible currently since + # no sensible "neutral" can be defined. We need a variant of `reduce1` + # that initializes the accumulator with the first element along a given + # reduction axis. + return None + + +# --- Add (incomplete) ufunc class --- # + + +# TODO: find out how to use one Ufunc class with __call__ signature depending +# on nin and nout. What doesn't work: +# - Setting __call__ on the class in __new__ since later instantiations of +# the class overwrite previous ones (only one class object) +# object will not be registered as callable +# - Setting __call__ on an instance (in __new__ or __init__), since the +# object will not be registered as callable +# - Metaclass using __call__, need to find a more complete reference +# +# We need some way of setting __call__ on the class, but making one copy of +# the class per call signature. Not sure how that makes any sense though... 
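One pragmatic way to work with the constraints listed above is to keep ``UfuncBase`` itself free of ``__call__`` and let a small factory choose among the arity-specific subclasses defined below. The sketch uses hypothetical names (``_UFUNC_CLASSES``, ``make_ufunc``) that are not part of the patch; the patch instead instantiates the subclasses directly in the registration loops::

    # Map (nin, nout) to the subclass with the matching __call__ signature.
    _UFUNC_CLASSES = {(0, 1): Ufunc01, (1, 1): Ufunc11,
                      (1, 2): Ufunc12, (2, 1): Ufunc21}

    def make_ufunc(name, nin, nout, call, **kwargs):
        cls = _UFUNC_CLASSES[(nin, nout)]
        # Extra keyword arguments (identity, reduce, ...) are forwarded
        # unchanged to UfuncBase.__init__.
        return cls(name, nin, nout, call, **kwargs)

    # For example: make_ufunc('add', 2, 1, call=make_binary_ufunc('add', doc))

This keeps the per-arity ``__call__`` signatures explicit while exposing a single construction entry point.
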
+ + +class UfuncBase(object): + + def __init__(self, name, nin, nout, call, **kwargs): + self.name = name + self.nin = nin + self.nout = nout + self.nargs = self.nin + self.nout + self._call = call + + self.identity = kwargs.pop('identity', None) + + # Wrappers for numpy methods that take care of the array conversions + + def _at_not_impl(self, a, indices, b=None): + """Not implemented.""" + raise NotImplementedError + + def _accumulate_wrapper(array, axis=0, dtype=None, out=None, + keepdims=None): + if out is not None: + out_ndarray = numpy.asarray(out) + out_ndarray = self._npy_accumulate(array, axis, dtype, + out_ndarray, keepdims) + out[:] = out_ndarray + else: + ctx = getattr(array, 'context', None) + out_ndarray = self._npy_accumulate(array, axis, dtype, + None, keepdims) + out = array(out_ndarray, context=ctx, cls=ndgpuarray) + + return out + + def _outer_wrapper(A, B, **kwargs): + out = kwargs.pop('out', None) + if out is not None: + out_ndarray = numpy.asarray(out) + out_ndarray = self._npy_outer(A, B, out=out_ndarray, **kwargs) + out[:] = out_ndarray + else: + ctx = getattr(array, 'context', None) + out_ndarray = self._npy_outer(A, B, **kwargs) + out = array(out_ndarray, context=ctx, cls=ndgpuarray) + + return out + + def _reduce_wrapper(a, axis=0, dtype=None, out=None, + keepdims=False): + if out is not None: + out_ndarray = numpy.asarray(out) + out_ndarray = self._npy_reduce(a, axis, dtype, out_ndarray, + keepdims) + out[:] = out_ndarray + else: + ctx = getattr(a, 'context', None) + out_ndarray = self._npy_reduce(a, axis, dtype, None, + keepdims) + if numpy.isscalar(out_ndarray): + out = out_ndarray + else: + out = array(out_ndarray, context=ctx, cls=ndgpuarray) + + return out + + def _reduceat_wrapper(a, indices, axis=0, dtype=None, out=None): + if out is not None: + out_ndarray = numpy.asarray(out) + out_ndarray = self._npy_reduceat(a, indices, axis, dtype, + out_ndarray) + out[:] = out_ndarray + else: + ctx = getattr(a, 'context', None) + out_ndarray = self._npy_reduceat(a, indices, axis, dtype, + None) + if numpy.isscalar(out_ndarray): + out = out_ndarray + else: + out = array(out_ndarray, context=ctx, cls=ndgpuarray) + + return out + + numpy_ufunc = getattr(numpy, name) + + self.accumulate = kwargs.pop('accumulate', _accumulate_wrapper) + self.accumulate.__name__ = self.accumulate.__qualname__ = 'accumulate' + self._npy_accumulate = numpy_ufunc.accumulate + + self.at = kwargs.pop('at', _at_not_impl) + self.at.__qualname__ = name + '.at' + self.at.__name__ = 'at' + + self.outer = kwargs.pop('outer', _outer_wrapper) + self.outer.__name__ = 'outer' + self.outer.__qualname__ = name + '.outer' + self._npy_outer = numpy_ufunc.outer + + reduce = kwargs.pop('reduce', None) + if reduce is None: + self.reduce = _reduce_wrapper + else: + self.reduce = reduce + self.reduce.__name__ = 'reduce' + self.reduce.__qualname__ = name + '.reduce' + self._npy_reduce = numpy_ufunc.reduce + + self.reduceat = kwargs.pop('reduceat', _reduceat_wrapper) + self.reduceat.__name__ = 'reduceat' + self.reduceat.__qualname__ = name + '.reduceat' + self._npy_reduceat = numpy_ufunc.reduceat + + def __repr__(self): + return ''.format(self.name) + + +class Ufunc01(UfuncBase): + + def __call__(self, out=None): + return self._call(out=out) + + +class Ufunc11(UfuncBase): + + def __call__(self, x, out=None): + return self._call(x, out=out) + + +class Ufunc12(UfuncBase): + + def __call__(self, x, out1=None, out2=None): + return self._call(x, out1=out1, out2=out2) + + +class Ufunc21(UfuncBase): + + def 
__call__(self, x1, x2, out=None): + return self._call(x1, x2, out=out) + + # --- Add ufuncs to global namespace --- # @@ -1077,17 +1267,15 @@ def wrapper(a, out=None): # Add the ufuncs to the module dictionary for ufunc_name in UNARY_UFUNCS: npy_ufunc = getattr(numpy, ufunc_name) + assert npy_ufunc.nin == 1 + assert npy_ufunc.nout == 1 descr = npy_ufunc.__doc__.splitlines()[2] # Numpy occasionally uses single ticks for doc, we only use them for links - descr = re.sub('`+', '``', descr) - doc = descr + """ - -See Also --------- -numpy.{} -""".format(ufunc_name) - - globals()[ufunc_name] = make_unary_ufunc(ufunc_name, doc) + doc = re.sub('`+', '``', descr) + ufunc = Ufunc11(ufunc_name, nin=1, nout=1, + call=make_unary_ufunc(ufunc_name, doc), + identity=npy_ufunc.identity) + globals()[ufunc_name] = ufunc def make_unary_ufunc_two_out(name, doc): @@ -1103,15 +1291,11 @@ def wrapper(a, out1=None, out2=None): npy_ufunc = getattr(numpy, ufunc_name) descr = npy_ufunc.__doc__.splitlines()[2] # Numpy occasionally uses single ticks for doc, we only use them for links - descr = re.sub('`+', '``', descr) - doc = descr + """ - -See Also --------- -numpy.{} -""".format(ufunc_name) - - globals()[ufunc_name] = make_unary_ufunc_two_out(ufunc_name, doc) + doc = re.sub('`+', '``', descr) + ufunc = Ufunc12(ufunc_name, nin=1, nout=2, + call=make_unary_ufunc_two_out(ufunc_name, doc), + identity=npy_ufunc.identity) + globals()[ufunc_name] = ufunc def make_binary_ufunc(name, doc): @@ -1127,15 +1311,12 @@ def wrapper(a, b, out=None): npy_ufunc = getattr(numpy, ufunc_name) descr = npy_ufunc.__doc__.splitlines()[2] # Numpy occasionally uses single ticks for doc, we only use them for links - descr = re.sub('`+', '``', descr) - doc = descr + """ - -See Also --------- -numpy.{} -""".format(ufunc_name) - - globals()[ufunc_name] = make_binary_ufunc(ufunc_name, doc) + doc = re.sub('`+', '``', descr) + ufunc = Ufunc21(ufunc_name, nin=2, nout=1, + call=make_binary_ufunc(ufunc_name, doc), + identity=npy_ufunc.identity, + reduce=make_binary_ufunc_reduce(ufunc_name)) + globals()[ufunc_name] = ufunc for name, alt_name in UFUNC_SYNONYMS: From d1c597744d94a5027db0237e542d379fdd35c98f Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Wed, 7 Jun 2017 00:34:20 +0200 Subject: [PATCH 17/29] TST: add tests for ufunc.reduce --- pygpu/tests/test_ufuncs.py | 123 ++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index 4c190dd07a..51312ec9e3 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -4,7 +4,9 @@ import pygpu from pygpu.dtypes import NAME_TO_DTYPE from pygpu import ufuncs -from pygpu.ufuncs import UNARY_UFUNCS, UNARY_UFUNCS_TWO_OUT, BINARY_UFUNCS +from pygpu.ufuncs import ( + UNARY_UFUNCS, UNARY_UFUNCS_TWO_OUT, BINARY_UFUNCS, BINARY_UFUNC_TO_C_CMP, + BINARY_UFUNC_TO_C_BINOP, BINARY_UFUNC_TO_C_FUNC) from pygpu.tests.support import context @@ -438,3 +440,122 @@ def check_binary_ufunc_scalar_right(ufunc, scalar, dtype): assert npy_result.dtype == gpuary_result.dtype assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, atol=rtol, equal_nan=True) + + +def test_binary_ufuncs_reduce(): + for ufunc in BINARY_UFUNC_TO_C_FUNC: + # Only one axis parameter can be used since it's not reorderable + for axis in [0, 1, 2]: + for keepdims in [True, False]: + if (ufunc in fail_binary and + 'float32' in fail_binary[ufunc].get('dtype1', + ['float32']) and + 'float32' in fail_binary[ufunc].get('dtype2', + ['float32'])): 
+ pass + else: + yield check_binary_ufunc_reduce, ufunc, axis, keepdims + + for ufunc in BINARY_UFUNC_TO_C_BINOP: + for axis in axis_params: + for keepdims in [True, False]: + if (ufunc in fail_binary and + 'float32' in fail_binary[ufunc].get('dtype1', + ['float32']) and + 'float32' in fail_binary[ufunc].get('dtype2', + ['float32'])): + pass + elif ufunc.startswith('bitwise') or ufunc.endswith('shift'): + # Invalid source + pass + elif ufunc.startswith('logical'): + # Wrong resulting dtype (float instead of bool) + pass + elif ufunc in ('subtract',): + # Not reorderable + pass + else: + yield check_binary_ufunc_reduce, ufunc, axis, keepdims + + for ufunc in BINARY_UFUNC_TO_C_CMP: + for axis in axis_params: + for keepdims in [True, False]: + if (ufunc in fail_binary and + 'float32' in fail_binary[ufunc].get('dtype1', + ['float32']) and + 'float32' in fail_binary[ufunc].get('dtype2', + ['float32'])): + pass + elif ufunc in ('equal', 'not_equal', + 'greater', 'greater_equal', + 'less', 'less_equal'): + # Not reorderable + pass + else: + yield check_binary_ufunc_reduce, ufunc, axis, keepdims + + +def check_binary_ufunc_reduce(ufunc, axis, keepdims): + """Test GpuArray ufunc.reduce against equivalent Numpy result.""" + gpuary_ufunc = getattr(ufuncs, ufunc) + npy_ufunc = getattr(numpy, ufunc) + + npy_arr, gpuary_arr = npy_and_gpuary_arrays(shape=(2, 3, 4), + dtype='float32') + # Determine relative tolerance from dtype + try: + res = numpy.finfo('float32').resolution + except ValueError: + res = 0 + rtol = 2 * npy_arr.size * res + + # No explicit out dtype + try: + npy_result = npy_ufunc.reduce(npy_arr, axis=axis, keepdims=keepdims) + except TypeError: + # Make sure missing signature produces a TypeError, then bail out + with assert_raises(TypeError): + gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, + keepdims=keepdims) + return + + gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, + keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol, equal_nan=True) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol, equal_nan=True) + + # With out array + out = pygpu.empty(npy_result.shape, npy_result.dtype) + gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, out=out, + keepdims=keepdims) + assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol, + equal_nan=True) + assert out is gpuary_result + + # Explicit out dtype, supported by some reductions only + if ufunc in BINARY_UFUNC_TO_C_BINOP or ufunc in BINARY_UFUNC_TO_C_FUNC: + gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, + dtype='float64', + keepdims=keepdims) + assert gpuary_result.dtype == 'float64' + assert numpy.allclose(out, npy_result, rtol=rtol, atol=rtol, + equal_nan=True) + + # Using numpy arrays as input + gpuary_result = gpuary_ufunc.reduce(npy_arr, axis=axis, keepdims=keepdims) + if numpy.isscalar(npy_result): + assert numpy.isscalar(gpuary_result) + assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol, equal_nan=True) + else: + assert npy_result.shape == gpuary_result.shape + assert npy_result.dtype == gpuary_result.dtype + assert numpy.allclose(gpuary_result, npy_result, rtol=rtol, + atol=rtol, equal_nan=True) From fccfeeab96b50535437c9a1beacddc62ae86cd8f Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 5 Jun 2017 02:20:02 +0200 Subject: 
[PATCH 18/29] ENH: use new reductions on ndgpuarray --- pygpu/_array.py | 74 ++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 56 deletions(-) diff --git a/pygpu/_array.py b/pygpu/_array.py index 2fc793f838..c1de9ea03b 100644 --- a/pygpu/_array.py +++ b/pygpu/_array.py @@ -1,9 +1,11 @@ from __future__ import division +from functools import partial import numpy as np from .elemwise import elemwise1, elemwise2, ielemwise2, compare, arg, GpuElemwise, as_argument from .reduction import reduce1 from .dtypes import dtype_to_ctype, get_np_obj, get_common_dtype +from . import ufuncs from . import gpuarray @@ -235,60 +237,20 @@ def fill(self, value): self[...] = value """ # reductions - def all(self, axis=None, out=None): - if self.ndim == 0: - return self.copy() - return reduce1(self, '&&', '1', np.dtype('bool'), - axis=axis, out=out) + def all(self, axis=None, out=None, keepdims=False): + return ufuncs.all(self, axis, out, keepdims) - def any(self, axis=None, out=None): - if self.ndim == 0: - return self.copy() - return reduce1(self, '||', '0', np.dtype('bool'), - axis=axis, out=out) - - def prod(self, axis=None, dtype=None, out=None): - if dtype is None: - dtype = self.dtype - # we only upcast integers that are smaller than the plaform default - if dtype.kind == 'i': - di = np.dtype('int') - if di.itemsize > dtype.itemsize: - dtype = di - if dtype.kind == 'u': - di = np.dtype('uint') - if di.itemsize > dtype.itemsize: - dtype = di - return reduce1(self, '*', '1', dtype, axis=axis, out=out) - -# def max(self, axis=None, out=None); -# nd = self.ndim -# if nd == 0: -# return self.copy() -# idx = (0,) * nd -# n = str(self.__getitem__(idx).__array__()) -# return reduce1(self, '', n, self.dtype, axis=axis, out=out, -# oper='max(a, b)') - -# def min(self, axis=None, out=None): -# nd = self.ndim -# if nd == 0: -# return self.copy() -# idx = (0,) * nd -# n = str(self.__getitem__(idx).__array__()) -# return reduce1(self, '', n, self.dtype, axis=axis, out=out, -# oper='min(a, b)') - - def sum(self, axis=None, dtype=None, out=None): - if dtype is None: - dtype = self.dtype - # we only upcast integers that are smaller than the plaform default - if dtype.kind == 'i': - di = np.dtype('int') - if di.itemsize > dtype.itemsize: - dtype = di - if dtype.kind == 'u': - di = np.dtype('uint') - if di.itemsize > dtype.itemsize: - dtype = di - return reduce1(self, '+', '0', dtype, axis=axis, out=out) + def any(self, axis=None, out=None, keepdims=False): + return ufuncs.any(self, axis, out, keepdims) + + def prod(self, axis=None, out=None, keepdims=False): + return ufuncs.prod(self, axis, out, keepdims) + + def sum(self, axis=None, out=None, keepdims=False): + return ufuncs.sum(self, axis, out, keepdims) + + def min(self, axis=None, out=None, keepdims=False): + return ufuncs.amin(self, axis, out, keepdims) + + def max(self, axis=None, out=None, keepdims=False): + return ufuncs.amax(self, axis, out, keepdims) From fed41eb88aae621a2e2d2a6ed143ccae8aeffd79 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Sun, 11 Jun 2017 02:46:52 +0200 Subject: [PATCH 19/29] MAINT: remove Numpy fallbacks in ufuncs --- pygpu/ufuncs.py | 100 ++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 75 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index a47e0167f3..cd36ce93d1 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -1050,8 +1050,14 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): return out1, out2 +# TODO: add ufuncs conditionally depending 
on Numpy version? MISSING_UFUNCS = [ 'conjugate', # no complex dtype yet + 'float_power', # new in Numpy 1.12 + 'divmod', # new in Numpy 1.13 + 'heaviside', # new in Numpy 1.13 + 'isnat', # new in Numpy 1.13 + 'positive', # new in Numpy 1.13 ] @@ -1112,8 +1118,8 @@ def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): # object will not be registered as callable # - Metaclass using __call__, need to find a more complete reference # -# We need some way of setting __call__ on the class, but making one copy of -# the class per call signature. Not sure how that makes any sense though... +# We need some way of setting the __call__ signature of an instance, +# probably by modifying __call__.__code__. class UfuncBase(object): @@ -1127,103 +1133,47 @@ def __init__(self, name, nin, nout, call, **kwargs): self.identity = kwargs.pop('identity', None) - # Wrappers for numpy methods that take care of the array conversions + # Wrappers for unimplemented stuff - def _at_not_impl(self, a, indices, b=None): - """Not implemented.""" - raise NotImplementedError + def _at_not_impl(a, indices, b=None): + return NotImplemented - def _accumulate_wrapper(array, axis=0, dtype=None, out=None, - keepdims=None): - if out is not None: - out_ndarray = numpy.asarray(out) - out_ndarray = self._npy_accumulate(array, axis, dtype, - out_ndarray, keepdims) - out[:] = out_ndarray - else: - ctx = getattr(array, 'context', None) - out_ndarray = self._npy_accumulate(array, axis, dtype, - None, keepdims) - out = array(out_ndarray, context=ctx, cls=ndgpuarray) - - return out + def _accumulate_not_impl(array, axis=0, dtype=None, out=None, + keepdims=None): + return NotImplemented - def _outer_wrapper(A, B, **kwargs): - out = kwargs.pop('out', None) - if out is not None: - out_ndarray = numpy.asarray(out) - out_ndarray = self._npy_outer(A, B, out=out_ndarray, **kwargs) - out[:] = out_ndarray - else: - ctx = getattr(array, 'context', None) - out_ndarray = self._npy_outer(A, B, **kwargs) - out = array(out_ndarray, context=ctx, cls=ndgpuarray) + def _outer_not_impl(A, B, **kwargs): + return NotImplemented - return out - - def _reduce_wrapper(a, axis=0, dtype=None, out=None, - keepdims=False): - if out is not None: - out_ndarray = numpy.asarray(out) - out_ndarray = self._npy_reduce(a, axis, dtype, out_ndarray, - keepdims) - out[:] = out_ndarray - else: - ctx = getattr(a, 'context', None) - out_ndarray = self._npy_reduce(a, axis, dtype, None, - keepdims) - if numpy.isscalar(out_ndarray): - out = out_ndarray - else: - out = array(out_ndarray, context=ctx, cls=ndgpuarray) - - return out - - def _reduceat_wrapper(a, indices, axis=0, dtype=None, out=None): - if out is not None: - out_ndarray = numpy.asarray(out) - out_ndarray = self._npy_reduceat(a, indices, axis, dtype, - out_ndarray) - out[:] = out_ndarray - else: - ctx = getattr(a, 'context', None) - out_ndarray = self._npy_reduceat(a, indices, axis, dtype, - None) - if numpy.isscalar(out_ndarray): - out = out_ndarray - else: - out = array(out_ndarray, context=ctx, cls=ndgpuarray) - - return out + def _reduce_not_impl(a, axis=0, dtype=None, out=None, + keepdims=False): + return NotImplemented - numpy_ufunc = getattr(numpy, name) + def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): + return NotImplemented - self.accumulate = kwargs.pop('accumulate', _accumulate_wrapper) + self.accumulate = kwargs.pop('accumulate', _accumulate_not_impl) self.accumulate.__name__ = self.accumulate.__qualname__ = 'accumulate' - self._npy_accumulate = numpy_ufunc.accumulate self.at 
= kwargs.pop('at', _at_not_impl) self.at.__qualname__ = name + '.at' self.at.__name__ = 'at' - self.outer = kwargs.pop('outer', _outer_wrapper) + self.outer = kwargs.pop('outer', _outer_not_impl) self.outer.__name__ = 'outer' self.outer.__qualname__ = name + '.outer' - self._npy_outer = numpy_ufunc.outer reduce = kwargs.pop('reduce', None) if reduce is None: - self.reduce = _reduce_wrapper + self.reduce = _reduce_not_impl else: self.reduce = reduce self.reduce.__name__ = 'reduce' self.reduce.__qualname__ = name + '.reduce' - self._npy_reduce = numpy_ufunc.reduce - self.reduceat = kwargs.pop('reduceat', _reduceat_wrapper) + self.reduceat = kwargs.pop('reduceat', _reduceat_not_impl) self.reduceat.__name__ = 'reduceat' self.reduceat.__qualname__ = name + '.reduceat' - self._npy_reduceat = numpy_ufunc.reduceat def __repr__(self): return ''.format(self.name) From 0a8c136aa485707517241d17ac4ce754d4d80f85 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Sun, 11 Jun 2017 02:48:23 +0200 Subject: [PATCH 20/29] ENH: add __array_ufunc__ interface and more ndgpuarray methods --- pygpu/_array.py | 249 +++++++++++++++++++++++++++++++----------------- 1 file changed, 160 insertions(+), 89 deletions(-) diff --git a/pygpu/_array.py b/pygpu/_array.py index c1de9ea03b..ca46c36227 100644 --- a/pygpu/_array.py +++ b/pygpu/_array.py @@ -1,10 +1,8 @@ from __future__ import division -from functools import partial -import numpy as np +import numpy -from .elemwise import elemwise1, elemwise2, ielemwise2, compare, arg, GpuElemwise, as_argument -from .reduction import reduce1 -from .dtypes import dtype_to_ctype, get_np_obj, get_common_dtype +from .elemwise import elemwise1, arg, GpuElemwise, as_argument +from .dtypes import dtype_to_ctype, get_common_dtype from . import ufuncs from . import gpuarray @@ -21,125 +19,126 @@ class ndgpuarray(gpuarray.GpuArray): more like a drop-in replacement for numpy.ndarray than the raw GpuArray class. """ + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # Unwrap out if necessary + out_tuple = kwargs.pop('out', ()) + if not all(isinstance(out, gpuarray.GpuArray) for out in out_tuple): + return NotImplemented + + if len(out_tuple) == 1: + kwargs['out'] = out_tuple[0] + elif len(out_tuple) == 2: + kwargs['out1'] = out_tuple[0] + kwargs['out2'] = out_tuple[1] + + native_ufunc = getattr(ufuncs, ufunc.__name__, None) + if native_ufunc is None: + return NotImplemented + + if method == '__call__': + return native_ufunc(*inputs, **kwargs) + else: + function = getattr(native_ufunc, method, None) + return function(*inputs, **kwargs) + + # The special methods call explicitly into __array_ufunc__ to make + # sure that the native ufuncs are used also for Numpy 1.12 and below. + # The only difference will be the behavior when Numpy ufuncs are + # used: for 1.12 and below, numpy.add(x_gpu, y) will cast x_gpu to + # a Numpy array, 1.13 and above calls into the native GPU ufunc. 
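A hedged usage sketch of the dispatch behavior described in the comment above (assumes a default context has been set, e.g. via ``pygpu.init`` and ``set_default_context``; array values are illustrative only):

    import numpy
    import pygpu
    from pygpu._array import ndgpuarray

    x = pygpu.array(numpy.asarray([1., 2., 3.], dtype='float32'), cls=ndgpuarray)
    y = pygpu.array(numpy.asarray([4., 5., 6.], dtype='float32'), cls=ndgpuarray)

    z1 = x + y            # always the GPU kernel, via the explicit
                          # __array_ufunc__ call in __add__
    z2 = numpy.add(x, y)  # NumPy >= 1.13: dispatched to the GPU ufunc through
                          # __array_ufunc__; NumPy <= 1.12: x is cast to a
                          # numpy.ndarray first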
+ # add def __add__(self, other): - return elemwise2(self, '+', other, self, broadcast=True) + return self.__array_ufunc__(numpy.add, '__call__', self, other) def __radd__(self, other): - return elemwise2(other, '+', self, self, broadcast=True) + return self.__array_ufunc__(numpy.add, '__call__', other, self) def __iadd__(self, other): - return ielemwise2(self, '+', other, broadcast=True) + return self.__array_ufunc__(numpy.add, '__call__', self, other, + out=self) # sub def __sub__(self, other): - return elemwise2(self, '-', other, self, broadcast=True) + return self.__array_ufunc__(numpy.subtract, '__call__', self, other) def __rsub__(self, other): - return elemwise2(other, '-', self, self, broadcast=True) + return self.__array_ufunc__(numpy.subtract, '__call__', other, self) def __isub__(self, other): - return ielemwise2(self, '-', other, broadcast=True) + return self.__array_ufunc__(numpy.subtract, '__call__', self, other, + out=self) # mul def __mul__(self, other): - return elemwise2(self, '*', other, self, broadcast=True) + return self.__array_ufunc__(numpy.multiply, '__call__', self, other) def __rmul__(self, other): - return elemwise2(other, '*', self, self, broadcast=True) + return self.__array_ufunc__(numpy.multiply, '__call__', other, self) def __imul__(self, other): - return ielemwise2(self, '*', other, broadcast=True) + return self.__array_ufunc__(numpy.multiply, '__call__', self, other, + out=self) # div def __div__(self, other): - return elemwise2(self, '/', other, self, broadcast=True) + return self.__array_ufunc__(numpy.divide, '__call__', self, other) def __rdiv__(self, other): - return elemwise2(other, '/', self, self, broadcast=True) + return self.__array_ufunc__(numpy.divide, '__call__', other, self) def __idiv__(self, other): - return ielemwise2(self, '/', other, broadcast=True) + return self.__array_ufunc__(numpy.divide, '__call__', self, other, + out=self) # truediv def __truediv__(self, other): - np1 = get_np_obj(self) - np2 = get_np_obj(other) - res = (np1.__truediv__(np2)).dtype - return elemwise2(self, '/', other, self, odtype=res, broadcast=True) + return self.__array_ufunc__(numpy.true_divide, '__call__', + self, other) def __rtruediv__(self, other): - np1 = get_np_obj(self) - np2 = get_np_obj(other) - res = (np2.__truediv__(np1)).dtype - return elemwise2(other, '/', self, self, odtype=res, broadcast=True) + return self.__array_ufunc__(numpy.true_divide, '__call__', + other, self) def __itruediv__(self, other): - np2 = get_np_obj(other) - kw = {'broadcast': True} - if self.dtype == np.float32 or np2.dtype == np.float32: - kw['op_tmpl'] = "a = (float)a / (float)b" - if self.dtype == np.float64 or np2.dtype == np.float64: - kw['op_tmpl'] = "a = (double)a / (double)b" - return ielemwise2(self, '/', other, **kw) + return self.__array_ufunc__(numpy.true_divide, '__call__', + self, other, out=self) # floordiv def __floordiv__(self, other): - out_dtype = get_common_dtype(self, other, True) - kw = {'broadcast': True} - if out_dtype.kind == 'f': - kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)" - return elemwise2(self, '/', other, self, odtype=out_dtype, **kw) + return self.__array_ufunc__(numpy.floor_divide, '__call__', + self, other) def __rfloordiv__(self, other): - out_dtype = get_common_dtype(other, self, True) - kw = {'broadcast': True} - if out_dtype.kind == 'f': - kw['op_tmpl'] = "res = floor((%(out_t)s)a / (%(out_t)s)b)" - return elemwise2(other, '/', self, self, odtype=out_dtype, **kw) + return self.__array_ufunc__(numpy.floor_divide, '__call__', + 
other, self) def __ifloordiv__(self, other): - out_dtype = self.dtype - kw = {'broadcast': True} - if out_dtype == np.float32: - kw['op_tmpl'] = "a = floor((float)a / (float)b)" - if out_dtype == np.float64: - kw['op_tmpl'] = "a = floor((double)a / (double)b)" - return ielemwise2(self, '/', other, **kw) + return self.__array_ufunc__(numpy.floor_divide, '__call__', + self, other, out=self) # mod def __mod__(self, other): - out_dtype = get_common_dtype(self, other, True) - kw = {'broadcast': True} - if out_dtype.kind == 'f': - kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)" - return elemwise2(self, '%', other, self, odtype=out_dtype, **kw) + return self.__array_ufunc__(numpy.mod, '__call__', self, other) def __rmod__(self, other): - out_dtype = get_common_dtype(other, self, True) - kw = {'broadcast': True} - if out_dtype.kind == 'f': - kw['op_tmpl'] = "res = fmod((%(out_t)s)a, (%(out_t)s)b)" - return elemwise2(other, '%', self, self, odtype=out_dtype, **kw) + return self.__array_ufunc__(numpy.mod, '__call__', other, self) def __imod__(self, other): - out_dtype = get_common_dtype(self, other, self.dtype == np.float64) - kw = {'broadcast': True} - if out_dtype == np.float32: - kw['op_tmpl'] = "a = fmod((float)a, (float)b)" - if out_dtype == np.float64: - kw['op_tmpl'] = "a = fmod((double)a, (double)b)" - return ielemwise2(self, '%', other, **kw) + return self.__array_ufunc__(numpy.mod, '__call__', self, other, + out=self) # divmod def __divmod__(self, other): if not isinstance(other, gpuarray.GpuArray): - other = np.asarray(other) + other = numpy.asarray(other) odtype = get_common_dtype(self, other, True) a_arg = as_argument(self, 'a', read=True) b_arg = as_argument(other, 'b', read=True) - args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg] + args = [arg('div', odtype, write=True), + arg('mod', odtype, write=True), + a_arg, b_arg] div = self._empty_like_me(dtype=odtype) mod = self._empty_like_me(dtype=odtype) @@ -159,12 +158,14 @@ def __divmod__(self, other): def __rdivmod__(self, other): if not isinstance(other, gpuarray.GpuArray): - other = np.asarray(other) + other = numpy.asarray(other) odtype = get_common_dtype(other, self, True) a_arg = as_argument(other, 'a', read=True) b_arg = as_argument(self, 'b', read=True) - args = [arg('div', odtype, write=True), arg('mod', odtype, write=True), a_arg, b_arg] + args = [arg('div', odtype, write=True), + arg('mod', odtype, write=True), + a_arg, b_arg] div = self._empty_like_me(dtype=odtype) mod = self._empty_like_me(dtype=odtype) @@ -182,42 +183,112 @@ def __rdivmod__(self, other): k(div, mod, other, self, broadcast=True) return (div, mod) + # unary ops def __neg__(self): - return elemwise1(self, '-') + return self.__array_ufunc__(numpy.negative, '__call__', self) def __pos__(self): return elemwise1(self, '+') def __abs__(self): - if self.dtype.kind == 'u': - return self.copy() - if self.dtype.kind == 'f': - oper = "res = fabs(a)" - elif self.dtype.itemsize < 4: - # cuda 5.5 finds the c++ stdlib definition if we don't cast here. 
- oper = "res = abs((int)a)" - else: - oper = "res = abs(a)" - return elemwise1(self, None, oper=oper) + return self.__array_ufunc__(numpy.abs, '__call__', self) + + def __invert__(self): + return self.__array_ufunc__(numpy.invert, '__call__', self) # richcmp def __lt__(self, other): - return compare(self, '<', other, broadcast=True) + return self.__array_ufunc__(numpy.less, '__call__', self, other) def __le__(self, other): - return compare(self, '<=', other, broadcast=True) + return self.__array_ufunc__(numpy.less_equal, '__call__', self, other) def __eq__(self, other): - return compare(self, '==', other, broadcast=True) + return self.__array_ufunc__(numpy.equal, '__call__', self, other) def __ne__(self, other): - return compare(self, '!=', other, broadcast=True) - - def __ge__(self, other): - return compare(self, '>=', other, broadcast=True) + return self.__array_ufunc__(numpy.not_equal, '__call__', self, other) def __gt__(self, other): - return compare(self, '>', other, broadcast=True) + return self.__array_ufunc__(numpy.greater, '__call__', self, other) + + def __ge__(self, other): + return self.__array_ufunc__(numpy.greater_equal, '__call__', self, + other) + + # pow + # TODO: pow can take a third modulo argument + def __pow__(self, other): + return self.__array_ufunc__(numpy.power, '__call__', self, other) + + def __rpow__(self, other): + return self.__array_ufunc__(numpy.power, '__call__', other, self) + + def __ipow__(self, other): + return self.__array_ufunc__(numpy.power, '__call__', self, other, + out=self) + + # shifts + def __lshift__(self, other): + return self.__array_ufunc__(numpy.left_shift, '__call__', + self, other) + + def __rlshift__(self, other): + return self.__array_ufunc__(numpy.left_shift, '__call__', + other, self) + + def __ilshift__(self, other): + return self.__array_ufunc__(numpy.left_shift, '__call__', + self, other, out=self) + + def __rshift__(self, other): + return self.__array_ufunc__(numpy.right_shift, '__call__', + self, other) + + def __rrshift__(self, other): + return self.__array_ufunc__(numpy.right_shift, '__call__', + other, self) + + def __irshift__(self, other): + return self.__array_ufunc__(numpy.right_shift, '__call__', + self, other, out=self) + + # logical ops + def __and__(self, other): + return self.__array_ufunc__(numpy.logical_and, '__call__', + self, other) + + def __rand__(self, other): + return self.__array_ufunc__(numpy.logical_and, '__call__', + other, self) + + def __iand__(self, other): + return self.__array_ufunc__(numpy.logical_and, '__call__', + self, other, out=self) + + def __or__(self, other): + return self.__array_ufunc__(numpy.logical_or, '__call__', + self, other) + + def __ror__(self, other): + return self.__array_ufunc__(numpy.logical_or, '__call__', + other, self) + + def __ior__(self, other): + return self.__array_ufunc__(numpy.logical_or, '__call__', + self, other, out=self) + + def __xor__(self, other): + return self.__array_ufunc__(numpy.logical_xor, '__call__', + self, other) + + def __rxor__(self, other): + return self.__array_ufunc__(numpy.logical_xor, '__call__', + other, self) + + def __ixor__(self, other): + return self.__array_ufunc__(numpy.logical_xor, '__call__', + self, other, out=self) # misc other things @property From 2c99f5721a554a984869e55d3b591b85cd8bea96 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Sun, 11 Jun 2017 03:05:34 +0200 Subject: [PATCH 21/29] MAINT: minor fixes --- pygpu/ufuncs.py | 51 +++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) 
diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index cd36ce93d1..6a980fd7d5 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -3,6 +3,7 @@ import mako import numpy import re +import sys import warnings from pygpu._array import ndgpuarray from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE @@ -11,6 +12,8 @@ from pygpu.reduction import reduce1 from pygpu.gpuarray import GpuArray, array, empty, get_default_context +PY3 = sys.version_info.major >= 3 + # --- Helper functions --- # @@ -58,9 +61,8 @@ def _prepare_array_for_reduction(a, out): if need_context: ctx = get_default_context() - cls = ndgpuarray # TODO: sensible choice as default? + cls = ndgpuarray - # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): a = numpy.asarray(a) if a.flags.f_contiguous and not a.flags.c_contiguous: @@ -575,9 +577,8 @@ def unary_ufunc(a, ufunc_name, out=None): if need_context: ctx = get_default_context() - cls = ndgpuarray # TODO: sensible choice as default? + cls = ndgpuarray - # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): a = numpy.asarray(a) if a.flags.f_contiguous and not a.flags.c_contiguous: @@ -739,11 +740,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): break if need_context: ctx = get_default_context() - # TODO: sensible choice? Makes sense to choose the more "feature-rich" - # variant here perhaps. cls = ndgpuarray - # TODO: can CPU memory handed directly to kernels? if not isinstance(a, GpuArray): if numpy.isscalar(a): # TODO: this is quite hacky, perhaps mixed input signatures @@ -760,6 +758,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): order = 'C' a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, cls=cls) + if not isinstance(b, GpuArray): if numpy.isscalar(b): # TODO: this is quite hacky, perhaps mixed input signatures @@ -981,9 +980,8 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): if need_context: ctx = get_default_context() - cls = ndgpuarray # TODO: sensible choice as default? + cls = ndgpuarray - # TODO: can CPU memory handed directly to kernels? 
if not isinstance(a, GpuArray): a = numpy.asarray(a) if a.flags.f_contiguous and not a.flags.c_contiguous: @@ -1078,9 +1076,7 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): def make_binary_ufunc_reduce(name): - npy_ufunc = getattr(numpy, name) - binop = BINARY_UFUNC_TO_C_BINOP.get(name, None) if binop is not None: @@ -1153,15 +1149,19 @@ def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): return NotImplemented self.accumulate = kwargs.pop('accumulate', _accumulate_not_impl) - self.accumulate.__name__ = self.accumulate.__qualname__ = 'accumulate' + self.accumulate.__name__ = 'accumulate' + if PY3: + self.accumulate.__qualname__ = 'accumulate' self.at = kwargs.pop('at', _at_not_impl) - self.at.__qualname__ = name + '.at' self.at.__name__ = 'at' + if PY3: + self.at.__qualname__ = name + '.at' self.outer = kwargs.pop('outer', _outer_not_impl) self.outer.__name__ = 'outer' - self.outer.__qualname__ = name + '.outer' + if PY3: + self.outer.__qualname__ = name + '.outer' reduce = kwargs.pop('reduce', None) if reduce is None: @@ -1169,11 +1169,13 @@ def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): else: self.reduce = reduce self.reduce.__name__ = 'reduce' - self.reduce.__qualname__ = name + '.reduce' + if PY3: + self.reduce.__qualname__ = name + '.reduce' self.reduceat = kwargs.pop('reduceat', _reduceat_not_impl) self.reduceat.__name__ = 'reduceat' - self.reduceat.__qualname__ = name + '.reduceat' + if PY3: + self.reduceat.__qualname__ = name + '.reduceat' def __repr__(self): return ''.format(self.name) @@ -1207,9 +1209,13 @@ def __call__(self, x1, x2, out=None): def make_unary_ufunc(name, doc): + def wrapper(a, out=None): return unary_ufunc(a, name, out) - wrapper.__qualname__ = wrapper.__name__ = name + + wrapper.__name__ = name + if PY3: + wrapper.__qualname__ = name wrapper.__doc__ = doc return wrapper @@ -1229,9 +1235,13 @@ def wrapper(a, out=None): def make_unary_ufunc_two_out(name, doc): + def wrapper(a, out1=None, out2=None): return unary_ufunc_two_out(a, name, out1, out2) - wrapper.__qualname__ = wrapper.__name__ = name + + wrapper.__name__ = name + if PY3: + wrapper.__qualname__ = name wrapper.__doc__ = doc return wrapper @@ -1249,10 +1259,13 @@ def wrapper(a, out1=None, out2=None): def make_binary_ufunc(name, doc): + def wrapper(a, b, out=None): return binary_ufunc(a, b, name, out) - wrapper.__qualname__ = wrapper.__name__ = name + wrapper.__name__ = name + if PY3: + wrapper.__qualname__ = name wrapper.__doc__ = doc return wrapper From bc14f5256a764e7af4a78c8157d3e8510dd16a93 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 18:19:27 +0200 Subject: [PATCH 22/29] BUG: fix circular imports --- pygpu/ufuncs.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 6a980fd7d5..3c8329ef14 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -5,7 +5,6 @@ import re import sys import warnings -from pygpu._array import ndgpuarray from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE from pygpu._elemwise import arg from pygpu.elemwise import as_argument, GpuElemwise @@ -45,6 +44,9 @@ def reduce_dims(shape, red_axes): def _prepare_array_for_reduction(a, out): """Return input array ready for usage in a reduction kernel.""" + # Lazy import to avoid circular dependency + from pygpu._array import ndgpuarray + # Get a context and an array class to work with. Use the "highest" # class present in the inputs. 
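A brief sketch of the cycle that the lazy import above breaks (hedged reconstruction from the imports visible in this series): ``pygpu/__init__.py`` imports ``ufuncs``, and ``pygpu/_array.py`` imports ``ufuncs`` as well, so a module-level ``from pygpu._array import ndgpuarray`` in ``ufuncs.py`` makes the two modules import each other before either is fully initialized. Deferring the import into the function body postpones the lookup until the first call, when both modules exist:

    def _example_helper(a):
        # hypothetical helper, for illustration only
        from pygpu._array import ndgpuarray  # resolved at call time
        return isinstance(a, ndgpuarray)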
need_context = True @@ -561,6 +563,9 @@ def unary_ufunc(a, ufunc_name, out=None): ufunc_result_dtype : Get dtype of a ufunc result. ufunc_c_funcname : Get C name for math ufuncs. """ + # Lazy import to avoid circular dependency + from pygpu._array import ndgpuarray + # Get a context and an array class to work with. Use the "highest" # class present in the inputs. need_context = True @@ -730,6 +735,10 @@ def binary_ufunc(a, b, ufunc_name, out=None): pygpu.gpuarray.set_default_context """ from builtins import any + + # Lazy import to avoid circular dependency + from pygpu._array import ndgpuarray + # Get a context and an array class to work with need_context = True for ary in (a, b, out): @@ -964,6 +973,9 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): ufunc_result_dtype : Get dtype of a ufunc result. ufunc_c_funcname : Get C name for math ufuncs. """ + # Lazy import to avoid circular dependency + from pygpu._array import ndgpuarray + # Get a context and an array class to work with. Use the "highest" # class present in the inputs. need_context = True From 627a448ebd2132b1695896946b10c80a2e3e1b41 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 18:24:16 +0200 Subject: [PATCH 23/29] BUG: fix qualname of accumulate --- pygpu/ufuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index 3c8329ef14..e2878dd2c9 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -1163,7 +1163,7 @@ def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): self.accumulate = kwargs.pop('accumulate', _accumulate_not_impl) self.accumulate.__name__ = 'accumulate' if PY3: - self.accumulate.__qualname__ = 'accumulate' + self.accumulate.__qualname__ = name + '.accumulate' self.at = kwargs.pop('at', _at_not_impl) self.at.__name__ = 'at' From b6a89010b03eca427e55d10c05e8e013f98a2766 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 18:36:38 +0200 Subject: [PATCH 24/29] BUG: remove imports from builtin --- pygpu/ufuncs.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index e2878dd2c9..c213f55f6f 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -13,6 +13,10 @@ PY3 = sys.version_info.major >= 3 +# Save for later use since the original names are used for reductions +builtin_all = all +builtin_any = any + # --- Helper functions --- # @@ -425,7 +429,6 @@ def ufunc_dtypes(ufunc_name, dtypes_in): >>> ufunc_dtypes('power', [numpy.dtype('int8'), numpy.dtype('float32')]) ((dtype('float32'), dtype('float32')), (dtype('float32'),)) """ - from builtins import all, any npy_ufunc = getattr(numpy, ufunc_name) supported_dtypes = set(NAME_TO_DTYPE.values()) @@ -437,10 +440,11 @@ def larger_eq_than_dtypes(sig): else: dts = tuple(numpy.dtype(c) for c in from_part) # Currently unsupported, filtering out - if any(dt not in supported_dtypes for dt in dts): + if builtin_any(dt not in supported_dtypes for dt in dts): return False else: - return all(dt >= dt_in for dt, dt_in in zip(dts, dtypes_in)) + return builtin_all(dt >= dt_in + for dt, dt_in in zip(dts, dtypes_in)) # List of ufunc signatures that are "larger" than our input dtypes larger_sig_list = list(filter(larger_eq_than_dtypes, npy_ufunc.types)) @@ -464,8 +468,8 @@ def from_part_key(sig): result_dtypes = tuple(numpy.dtype(c) for c in smallest_str_out) # Quad precision unsupported also on output side - if any(dt in result_dtypes for dt in (numpy.dtype('float16'), - numpy.dtype('float128'))): + if 
builtin_any(dt in result_dtypes for dt in (numpy.dtype('float16'), + numpy.dtype('float128'))): # TODO: Numpy raises TypeError for bad data types, which is wrong, # but we mirror that behavior raise TypeError('data types {} not supported for ufunc {}' @@ -734,8 +738,6 @@ def binary_ufunc(a, b, ufunc_name, out=None): -------- pygpu.gpuarray.set_default_context """ - from builtins import any - # Lazy import to avoid circular dependency from pygpu._array import ndgpuarray @@ -789,7 +791,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): b = array(b, dtype=b.dtype, copy=False, order=order, context=ctx, cls=cls) - if any(ary.dtype == numpy.dtype('float16') for ary in (a, b)): + if builtin_any(ary.dtype == numpy.dtype('float16') for ary in (a, b)): # Gives wrong results currently, see # https://github.com/Theano/libgpuarray/issues/316 raise NotImplementedError('float16 currently broken') @@ -800,7 +802,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): result_dtype = result_dtypes[0] # This is the "fallback signature" case, for us it signals failure - if any(dt == numpy.dtype(object) for dt in prom_dtypes_in): + if builtin_any(dt == numpy.dtype(object) for dt in prom_dtypes_in): raise TypeError('input dtypes {} invalid for ufunc {!r}' ''.format((a.dtype.name, b.dtype.name), ufunc_name)) From 48906b77fd7626aca588fddcd830201c4669991c Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 19:00:21 +0200 Subject: [PATCH 25/29] MAINT: raise TypeError for deprecated boolean ops in numpy >= 1.13 --- pygpu/tests/test_ufuncs.py | 4 ++++ pygpu/ufuncs.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index 51312ec9e3..efe6a28731 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -521,6 +521,10 @@ def check_binary_ufunc_reduce(ufunc, axis, keepdims): gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, keepdims=keepdims) + if gpuary_result is NotImplemented: + # Nothing to check + return + if numpy.isscalar(npy_result): assert numpy.isscalar(gpuary_result) assert numpy.isclose(gpuary_result, npy_result, rtol=rtol, diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index c213f55f6f..c782f91562 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -2,6 +2,7 @@ import mako import numpy +from pkg_resources import parse_version import re import sys import warnings @@ -645,11 +646,18 @@ def unary_ufunc(a, ufunc_name, out=None): unop = UNARY_UFUNC_TO_C_OP.get(ufunc_name, None) if unop is not None: if a.dtype == numpy.bool and unop == '-': - warnings.warn('using negation (`-`) with boolean arrays is ' - 'deprecated, use logical not (`~`) instead; ' - 'the current behavior will be changed along with ' - "NumPy's", FutureWarning) - unop = '!' + if parse_version(numpy.__version__) >= parse_version('1.13'): + # Numpy >= 1.13 raises a TypeError + raise TypeError( + 'negation of boolean arrays is not supported, use ' + '`logical_not` instead') + else: + # Warn and remap to logical not + warnings.warn('using negation (`-`) with boolean arrays is ' + 'deprecated, use `logical_not` (`~`) instead; ' + 'the current behavior will be changed along ' + "with NumPy's", FutureWarning) + unop = '!' 
oper = 'res = ({}) {}a'.format(c_res_dtype, unop) # Other cases: specific functions @@ -867,6 +875,21 @@ def binary_ufunc(a, b, ufunc_name, out=None): # Case 2: binary operator binop = BINARY_UFUNC_TO_C_OP.get(ufunc_name, None) if binop is not None: + if b.dtype == numpy.bool and binop == '-': + if parse_version(numpy.__version__) >= parse_version('1.13'): + # Numpy >= 1.13 raises a TypeError + raise TypeError( + 'subtraction of boolean arrays is not supported, use ' + '`logical_not` instead') + else: + # Warn and remap to logical not + warnings.warn('using subtraction (`-`) with boolean arrays is ' + 'deprecated, use `bitwise_xor` (`^`) or ' + '`logical_xor` instead; ' + 'the current behavior will be changed along ' + "with NumPy's", FutureWarning) + binop = '^' + oper = 'res = ({}) (a {} b)'.format(c_res_dtype, binop) else: # Other cases: specific functions @@ -887,7 +910,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): oper = 'res = ({}) floor_div_dbl((double) a, (double) b)'.format( c_res_dtype) - if ufunc_name == 'true_divide': + elif ufunc_name == 'true_divide': if result_dtype == numpy.dtype('float64'): flt = 'double' else: From 0bf6043dca3b79aaa97bd0600182460d1d0620d4 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 22:04:01 +0200 Subject: [PATCH 26/29] TST: avoid failing test due to not implemented reduction --- pygpu/tests/test_ufuncs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pygpu/tests/test_ufuncs.py b/pygpu/tests/test_ufuncs.py index efe6a28731..4246565963 100644 --- a/pygpu/tests/test_ufuncs.py +++ b/pygpu/tests/test_ufuncs.py @@ -466,13 +466,13 @@ def test_binary_ufuncs_reduce(): ['float32'])): pass elif ufunc.startswith('bitwise') or ufunc.endswith('shift'): - # Invalid source + # Invalid source error from CUDA pass elif ufunc.startswith('logical'): # Wrong resulting dtype (float instead of bool) pass elif ufunc in ('subtract',): - # Not reorderable + # Not reorderable, thus only applicable along one axis pass else: yield check_binary_ufunc_reduce, ufunc, axis, keepdims @@ -489,7 +489,7 @@ def test_binary_ufuncs_reduce(): elif ufunc in ('equal', 'not_equal', 'greater', 'greater_equal', 'less', 'less_equal'): - # Not reorderable + # Not reorderable, thus only applicable along one axis pass else: yield check_binary_ufunc_reduce, ufunc, axis, keepdims @@ -517,6 +517,10 @@ def check_binary_ufunc_reduce(ufunc, axis, keepdims): with assert_raises(TypeError): gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, keepdims=keepdims) + # Emulate the TypeError if there is no native implementation + # and Numpy raises + if gpuary_result is NotImplemented: + raise TypeError return gpuary_result = gpuary_ufunc.reduce(gpuary_arr, axis=axis, From 9f0a57837aae63ea4e6fc3f26e0bedc46b2eaaa1 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Mon, 12 Jun 2017 23:12:49 +0200 Subject: [PATCH 27/29] BUG: fix missing precision in deg2rad constant --- pygpu/ufuncs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index c782f91562..d46ca3b662 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -662,12 +662,12 @@ def unary_ufunc(a, ufunc_name, out=None): # Other cases: specific functions if ufunc_name == 'deg2rad': - oper = 'res = ({rdt})({}) * ({rdt}) a'.format(numpy.deg2rad(1), - rdt=c_res_dtype) + oper = 'res = ({rdt})({:.45f}) * ({rdt}) a'.format(numpy.deg2rad(1.0), + rdt=c_res_dtype) elif ufunc_name == 'rad2deg': - oper = 'res = ({rdt})({}) * ({rdt}) 
a'.format(numpy.rad2deg(1), - rdt=c_res_dtype) + oper = 'res = ({rdt})({:.45f}) * ({rdt}) a'.format(numpy.rad2deg(1.0), + rdt=c_res_dtype) elif ufunc_name == 'reciprocal': oper = 'res = ({dt}) (({dt}) 1.0) / a'.format(dt=c_res_dtype) From dd88c9a35dc729d4d725f68e8774eafa027cef93 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Tue, 13 Jun 2017 00:29:57 +0200 Subject: [PATCH 28/29] MAINT: add optional context parameter to ufuncs and friends --- pygpu/ufuncs.py | 256 +++++++++++++++++++++++------------------------- 1 file changed, 123 insertions(+), 133 deletions(-) diff --git a/pygpu/ufuncs.py b/pygpu/ufuncs.py index d46ca3b662..905c7dd853 100644 --- a/pygpu/ufuncs.py +++ b/pygpu/ufuncs.py @@ -3,7 +3,6 @@ import mako import numpy from pkg_resources import parse_version -import re import sys import warnings from pygpu.dtypes import dtype_to_ctype, NAME_TO_DTYPE @@ -47,7 +46,7 @@ def reduce_dims(shape, red_axes): # --- Reductions with arithmetic operators --- # -def _prepare_array_for_reduction(a, out): +def _prepare_array_for_reduction(a, out, context=None): """Return input array ready for usage in a reduction kernel.""" # Lazy import to avoid circular dependency from pygpu._array import ndgpuarray @@ -55,19 +54,18 @@ def _prepare_array_for_reduction(a, out): # Get a context and an array class to work with. Use the "highest" # class present in the inputs. need_context = True - ctx = None cls = None for ary in (a, out): if isinstance(ary, GpuArray): - if ctx is not None and ary.context != ctx: + if context is not None and ary.context != context: raise ValueError('cannot mix contexts') - ctx = ary.context + context = ary.context if cls is None or cls == GpuArray: cls = ary.__class__ need_context = False - if need_context: - ctx = get_default_context() + if need_context and context is None: + context = get_default_context() cls = ndgpuarray if not isinstance(a, GpuArray): @@ -76,14 +74,14 @@ def _prepare_array_for_reduction(a, out): order = 'F' else: order = 'C' - a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + a = array(a, dtype=a.dtype, copy=False, order=order, context=context, cls=cls) return a def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, - keepdims=False): + keepdims=False, context=None): """Reduce ``a`` using the operation ``op``. This is a wrapper around `pygpu.reduction.reduce1` with signature @@ -104,6 +102,10 @@ def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, axis, dtype, out, keepdims : Arguments as in NumPy reductions. See e.g. `numpy.sum`. + context : `pygpu.gpuarray.GpuContext`, optional + Use this GPU context to evaluate the GPU kernel. For ``None``, + and if neither ``out`` nor ``a`` are GPU arrays, a default + GPU context must have been set. Returns ------- @@ -113,7 +115,7 @@ def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, reductions along all axes without ``out`` parameter, the result is a scalar. """ - a = _prepare_array_for_reduction(a, out) + a = _prepare_array_for_reduction(a, out, context) if dtype is None: if numpy.issubsctype(a.dtype, numpy.unsignedinteger): @@ -145,7 +147,7 @@ def reduce_with_op(a, op, neutral, axis=None, dtype=None, out=None, return r -def sum(a, axis=None, dtype=None, out=None, keepdims=False): +def sum(a, axis=None, dtype=None, out=None, keepdims=False, context=None): """Sum of array elements over a given axis. 
See Also @@ -155,10 +157,10 @@ def sum(a, axis=None, dtype=None, out=None, keepdims=False): # Do what Numpy does with booleans, sensible or not if a.dtype == bool and dtype is None: dtype = int - return reduce_with_op(a, '+', 0, axis, dtype, out, keepdims) + return reduce_with_op(a, '+', 0, axis, dtype, out, keepdims, context) -def prod(a, axis=None, dtype=None, out=None, keepdims=False): +def prod(a, axis=None, dtype=None, out=None, keepdims=False, context=None): """Return the product of array elements over a given axis. See Also @@ -168,33 +170,34 @@ def prod(a, axis=None, dtype=None, out=None, keepdims=False): # Do what Numpy does with booleans, sensible or not if a.dtype == bool and dtype is None: dtype = int - return reduce_with_op(a, '*', 1, axis, dtype, out, keepdims) + return reduce_with_op(a, '*', 1, axis, dtype, out, keepdims, context) -def all(a, axis=None, out=None, keepdims=False): +def all(a, axis=None, out=None, keepdims=False, context=None): """Test whether all array elements along a given axis evaluate to True. See Also -------- numpy.all """ - return reduce_with_op(a, '&&', 1, axis, numpy.bool, out, keepdims) + return reduce_with_op(a, '&&', 1, axis, numpy.bool, out, keepdims, context) -def any(a, axis=None, out=None, keepdims=False): +def any(a, axis=None, out=None, keepdims=False, context=None): """Test whether all array elements along a given axis evaluate to True. See Also -------- numpy.all """ - return reduce_with_op(a, '||', 0, axis, numpy.bool, out, keepdims) + return reduce_with_op(a, '||', 0, axis, numpy.bool, out, keepdims, context) # --- Reductions with comparison operators --- # -def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): +def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False, + context=None): """Reduce ``a`` by comparison using ``cmp``. This is a wrapper around `pygpu.reduction.reduce1` with signature @@ -216,6 +219,10 @@ def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): axis, out, keepdims : Arguments as in NumPy reductions. See e.g. `numpy.amax`. + context : `pygpu.gpuarray.GpuContext`, optional + Use this GPU context to evaluate the GPU kernel. For ``None``, + and if neither ``out`` nor ``a`` are GPU arrays, a default + GPU context must have been set. Returns ------- @@ -225,7 +232,7 @@ def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): reductions along all axes without ``out`` parameter, the result is a scalar. """ - a = _prepare_array_for_reduction(a, out) + a = _prepare_array_for_reduction(a, out, context) axes = axis if axis is not None else tuple(range(a.ndim)) if out is not None: @@ -248,7 +255,7 @@ def reduce_with_cmp(a, cmp, neutral, axis=None, out=None, keepdims=False): return r -def amin(a, axis=None, out=None, keepdims=False): +def amin(a, axis=None, out=None, keepdims=False, context=None): """Return the minimum of an array or minimum along an axis. See Also @@ -268,10 +275,10 @@ def amin(a, axis=None, out=None, keepdims=False): else: raise ValueError('array dtype {!r} not supported' ''.format(a.dtype.name)) - return reduce_with_cmp(a, '<', neutral, axis, out, keepdims) + return reduce_with_cmp(a, '<', neutral, axis, out, keepdims, context) -def amax(a, axis=None, out=None, keepdims=False): +def amax(a, axis=None, out=None, keepdims=False, context=None): """Return the maximum of an array or minimum along an axis. 
See Also @@ -291,7 +298,7 @@ def amax(a, axis=None, out=None, keepdims=False): else: raise ValueError('array dtype {!r} not supported' ''.format(a.dtype.name)) - return reduce_with_cmp(a, '>', neutral, axis, out, keepdims) + return reduce_with_cmp(a, '>', neutral, axis, out, keepdims, context) # --- Elementwise ufuncs --- # @@ -540,7 +547,7 @@ def ufunc_c_fname(ufunc_name, dtypes_in): return prefix + c_base_name -def unary_ufunc(a, ufunc_name, out=None): +def unary_ufunc(a, ufunc_name, out=None, context=None): """Call a unary ufunc on an array ``a``. Parameters @@ -553,7 +560,9 @@ def unary_ufunc(a, ufunc_name, out=None): Array in which to store the result. Its shape must be equal to ``a.shape`` and its dtype must be the result dtype of the called function. - If ``out=None`` and ``a`` is not a GPU array, a default + context : `pygpu.gpuarray.GpuContext`, optional + Use this GPU context to evaluate the GPU kernel. For ``None``, + and if neither ``out`` nor ``a`` are GPU arrays, a default GPU context must have been set. Returns @@ -574,19 +583,18 @@ def unary_ufunc(a, ufunc_name, out=None): # Get a context and an array class to work with. Use the "highest" # class present in the inputs. need_context = True - ctx = None cls = None for ary in (a, out): if isinstance(ary, GpuArray): - if ctx is not None and ary.context != ctx: + if context is not None and ary.context != context: raise ValueError('cannot mix contexts') - ctx = ary.context + context = ary.context if cls is None or cls == GpuArray: cls = ary.__class__ need_context = False - if need_context: - ctx = get_default_context() + if need_context and context is None: + context = get_default_context() cls = ndgpuarray if not isinstance(a, GpuArray): @@ -595,7 +603,7 @@ def unary_ufunc(a, ufunc_name, out=None): order = 'F' else: order = 'C' - a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + a = array(a, dtype=a.dtype, copy=False, order=order, context=context, cls=cls) if a.dtype == numpy.dtype('float16'): @@ -618,7 +626,7 @@ def unary_ufunc(a, ufunc_name, out=None): a = a.astype(prom_dtype_in, copy=False) if out is None: - out = empty(a.shape, dtype=result_dtype, context=ctx, cls=cls) + out = empty(a.shape, dtype=result_dtype, context=context, cls=cls) else: # TODO: allow larger dtype if out.dtype != result_dtype: @@ -714,12 +722,12 @@ def unary_ufunc(a, ufunc_name, out=None): a_arg = as_argument(a, 'a', read=True) args = [arg('res', out.dtype, write=True), a_arg] - ker = GpuElemwise(ctx, oper, args, convert_f16=True) + ker = GpuElemwise(context, oper, args, convert_f16=True) ker(out, a) return out -def binary_ufunc(a, b, ufunc_name, out=None): +def binary_ufunc(a, b, ufunc_name, out=None, context=None): """Call binary ufunc on ``a`` and ``b``. Parameters @@ -732,7 +740,9 @@ def binary_ufunc(a, b, ufunc_name, out=None): Array in which to store the result. Its shape must be equal to ``a.shape`` and its dtype must be the result dtype of the called function. - If ``out=None`` and ``a, b`` are both not GPU arrays, a default + context : `pygpu.gpuarray.GpuContext`, optional + Use this GPU context to evaluate the GPU kernel. For ``None``, + and if neither ``out`` nor ``a`` are GPU arrays, a default GPU context must have been set. 
Returns @@ -750,15 +760,17 @@ def binary_ufunc(a, b, ufunc_name, out=None): from pygpu._array import ndgpuarray # Get a context and an array class to work with + cls = None need_context = True for ary in (a, b, out): if isinstance(a, GpuArray): - ctx = ary.context + context = ary.context cls = ary.__class__ need_context = False break - if need_context: - ctx = get_default_context() + + if need_context and context is None: + context = get_default_context() cls = ndgpuarray if not isinstance(a, GpuArray): @@ -767,6 +779,8 @@ def binary_ufunc(a, b, ufunc_name, out=None): # should be handled in ufunc_dtypes? if ufunc_name == 'ldexp': # Want signed type + # TODO: this upcasts to a larger dtype than Numpy in + # some cases (mostly unsigned int) a = numpy.asarray(a, dtype=numpy.min_scalar_type(-abs(a))) else: a = numpy.asarray(a, dtype=numpy.result_type(a, b)) @@ -775,7 +789,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): order = 'F' else: order = 'C' - a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + a = array(a, dtype=a.dtype, copy=False, order=order, context=context, cls=cls) if not isinstance(b, GpuArray): @@ -796,7 +810,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): else: b = numpy.asarray(b) order = 'C' - b = array(b, dtype=b.dtype, copy=False, order=order, context=ctx, + b = array(b, dtype=b.dtype, copy=False, order=order, context=context, cls=cls) if builtin_any(ary.dtype == numpy.dtype('float16') for ary in (a, b)): @@ -828,7 +842,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): result_shape = tuple(max(sa, sb) for sa, sb in zip(a.shape, b.shape)) if out is None: - out = empty(result_shape, dtype=result_dtype, context=ctx, cls=cls) + out = empty(result_shape, dtype=result_dtype, context=context, cls=cls) else: if out.shape != result_shape: raise ValueError('`out.shape` != result shape ({} != {})' @@ -967,7 +981,7 @@ def binary_ufunc(a, b, ufunc_name, out=None): return out -def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): +def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None, context=None): """Call a unary ufunc with two outputs on an array ``a``. Parameters @@ -981,8 +995,11 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): ``a.shape`` and their dtype must be the result dtypes of the called function. The arrays ``out1`` and ``out2`` must either both be provided, - or none of them. If both are ``None`` and ``a`` is not a - GPU array, a default GPU context must have been set. + or none of them. + context : `pygpu.gpuarray.GpuContext`, optional + Use this GPU context to evaluate the GPU kernel. For ``None``, + and if neither ``out1`` nor ``out2`` nor ``a`` are GPU arrays, + a default GPU context must have been set. Returns ------- @@ -1004,19 +1021,18 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): # Get a context and an array class to work with. Use the "highest" # class present in the inputs. 
need_context = True - ctx = None cls = None for ary in (a, out1, out2): if isinstance(ary, GpuArray): - if ctx is not None and ary.context != ctx: + if context is not None and ary.context != context: raise ValueError('cannot mix contexts') - ctx = ary.context + context = ary.context if cls is None or cls == GpuArray: cls = ary.__class__ need_context = False - if need_context: - ctx = get_default_context() + if need_context and context is None: + context = get_default_context() cls = ndgpuarray if not isinstance(a, GpuArray): @@ -1025,7 +1041,7 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): order = 'F' else: order = 'C' - a = array(a, dtype=a.dtype, copy=False, order=order, context=ctx, + a = array(a, dtype=a.dtype, copy=False, order=order, context=context, cls=cls) if a.dtype == numpy.dtype('float16'): @@ -1048,14 +1064,14 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): a = a.astype(prom_dtype_in, copy=False) if out1 is None: - out1 = empty(a.shape, dtype=result_dtype1, context=ctx, cls=cls) + out1 = empty(a.shape, dtype=result_dtype1, context=context, cls=cls) else: # TODO: allow larger dtype if out1.dtype != result_dtype1: raise ValueError('`out1.dtype` != result dtype: {!r} != {!r}' ''.format(out1.dtype.name, result_dtype1.name)) if out2 is None: - out2 = empty(a.shape, dtype=result_dtype2, context=ctx, cls=cls) + out2 = empty(a.shape, dtype=result_dtype2, context=context, cls=cls) else: if out2.dtype != result_dtype2: raise ValueError('`out2.dtype` != result dtype: {!r} != {!r}' @@ -1080,7 +1096,7 @@ def unary_ufunc_two_out(a, ufunc_name, out1=None, out2=None): arg('out2', out2.dtype, write=True), a_arg] - ker = GpuElemwise(ctx, oper, args, convert_f16=True) + ker = GpuElemwise(context, oper, args, convert_f16=True) ker(out1, out2, a) return out1, out2 @@ -1117,18 +1133,20 @@ def make_binary_ufunc_reduce(name): binop = BINARY_UFUNC_TO_C_BINOP.get(name, None) if binop is not None: - def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): + def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False, + context=None): return reduce_with_op(a, binop, npy_ufunc.identity, axis, dtype, - out, keepdims) + out, keepdims, context) return reduce_wrapper cmp = BINARY_UFUNC_TO_C_CMP.get(name, None) if cmp is not None: - def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): + def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False, + context=None): return reduce_with_cmp(a, cmp, npy_ufunc.identity, axis, dtype, - out, keepdims) + out, keepdims, context) return reduce_wrapper @@ -1158,7 +1176,7 @@ def reduce_wrapper(a, axis=0, dtype=None, out=None, keepdims=False): class UfuncBase(object): def __init__(self, name, nin, nout, call, **kwargs): - self.name = name + self.__name__ = name self.nin = nin self.nout = nout self.nargs = self.nin + self.nout @@ -1168,21 +1186,22 @@ def __init__(self, name, nin, nout, call, **kwargs): # Wrappers for unimplemented stuff - def _at_not_impl(a, indices, b=None): + def _at_not_impl(a, indices, b=None, context=None): return NotImplemented def _accumulate_not_impl(array, axis=0, dtype=None, out=None, - keepdims=None): + keepdims=None, context=None): return NotImplemented - def _outer_not_impl(A, B, **kwargs): + def _outer_not_impl(A, B, context=None, **kwargs): return NotImplemented def _reduce_not_impl(a, axis=0, dtype=None, out=None, - keepdims=False): + keepdims=False, context=None): return NotImplemented - def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): + def 
_reduceat_not_impl(a, indices, axis=0, dtype=None, out=None, + context=None): return NotImplemented self.accumulate = kwargs.pop('accumulate', _accumulate_not_impl) @@ -1215,108 +1234,80 @@ def _reduceat_not_impl(a, indices, axis=0, dtype=None, out=None): self.reduceat.__qualname__ = name + '.reduceat' def __repr__(self): - return ''.format(self.name) + return ''.format(self.__name__) class Ufunc01(UfuncBase): - def __call__(self, out=None): - return self._call(out=out) + def __call__(self, out=None, context=None): + return self._call(out=out, context=context) class Ufunc11(UfuncBase): - def __call__(self, x, out=None): - return self._call(x, out=out) + def __call__(self, x, out=None, context=None): + return self._call(x, out=out, context=context) class Ufunc12(UfuncBase): - def __call__(self, x, out1=None, out2=None): - return self._call(x, out1=out1, out2=out2) + def __call__(self, x, out1=None, out2=None, context=None): + return self._call(x, out1=out1, out2=out2, context=context) class Ufunc21(UfuncBase): - def __call__(self, x1, x2, out=None): - return self._call(x1, x2, out=out) + def __call__(self, x1, x2, out=None, context=None): + return self._call(x1, x2, out=out, context=context) # --- Add ufuncs to global namespace --- # -def make_unary_ufunc(name, doc): +def make_ufunc(name): + npy_ufunc = getattr(numpy, name) + nin = npy_ufunc.nin + nout = npy_ufunc.nout - def wrapper(a, out=None): - return unary_ufunc(a, name, out) + if nin == 1 and nout == 1: + cls = Ufunc11 - wrapper.__name__ = name - if PY3: - wrapper.__qualname__ = name - wrapper.__doc__ = doc - return wrapper + def call(a, out=None, context=None): + return unary_ufunc(a, name, out, context) + elif nin == 1 and nout == 2: + cls = Ufunc12 -# Add the ufuncs to the module dictionary -for ufunc_name in UNARY_UFUNCS: - npy_ufunc = getattr(numpy, ufunc_name) - assert npy_ufunc.nin == 1 - assert npy_ufunc.nout == 1 - descr = npy_ufunc.__doc__.splitlines()[2] - # Numpy occasionally uses single ticks for doc, we only use them for links - doc = re.sub('`+', '``', descr) - ufunc = Ufunc11(ufunc_name, nin=1, nout=1, - call=make_unary_ufunc(ufunc_name, doc), - identity=npy_ufunc.identity) - globals()[ufunc_name] = ufunc + def call(a, out1=None, out2=None, context=None): + return unary_ufunc_two_out(a, name, out1, out2, context) + elif nin == 2 and nout == 1: + cls = Ufunc21 -def make_unary_ufunc_two_out(name, doc): + def call(a, b, out=None, context=None): + return binary_ufunc(a, b, name, out, context) - def wrapper(a, out1=None, out2=None): - return unary_ufunc_two_out(a, name, out1, out2) + else: + raise NotImplementedError('nin = {}, nout = {} not supported' + ''.format(nin, nout)) - wrapper.__name__ = name + call.__name__ = name if PY3: - wrapper.__qualname__ = name - wrapper.__doc__ = doc - return wrapper + call.__qualname__ = name + # TODO: add docstring + if nin == 1: + ufunc = cls(name, nin, nout, call, identity=npy_ufunc.identity) + else: + ufunc = cls(name, nin, nout, call, identity=npy_ufunc.identity, + reduce=make_binary_ufunc_reduce(name)) -# Add the ufuncs to the module dictionary -for ufunc_name in UNARY_UFUNCS_TWO_OUT: - npy_ufunc = getattr(numpy, ufunc_name) - descr = npy_ufunc.__doc__.splitlines()[2] - # Numpy occasionally uses single ticks for doc, we only use them for links - doc = re.sub('`+', '``', descr) - ufunc = Ufunc12(ufunc_name, nin=1, nout=2, - call=make_unary_ufunc_two_out(ufunc_name, doc), - identity=npy_ufunc.identity) - globals()[ufunc_name] = ufunc - - -def make_binary_ufunc(name, doc): - - def 
wrapper(a, b, out=None): - return binary_ufunc(a, b, name, out) - - wrapper.__name__ = name - if PY3: - wrapper.__qualname__ = name - wrapper.__doc__ = doc - return wrapper + return ufunc -for ufunc_name in BINARY_UFUNCS: - npy_ufunc = getattr(numpy, ufunc_name) - descr = npy_ufunc.__doc__.splitlines()[2] - # Numpy occasionally uses single ticks for doc, we only use them for links - doc = re.sub('`+', '``', descr) - ufunc = Ufunc21(ufunc_name, nin=2, nout=1, - call=make_binary_ufunc(ufunc_name, doc), - identity=npy_ufunc.identity, - reduce=make_binary_ufunc_reduce(ufunc_name)) - globals()[ufunc_name] = ufunc +# Add the ufuncs to the module dictionary +for name in UNARY_UFUNCS + UNARY_UFUNCS_TWO_OUT + BINARY_UFUNCS: + globals()[name] = make_ufunc(name) for name, alt_name in UFUNC_SYNONYMS: @@ -1325,6 +1316,5 @@ def wrapper(a, b, out=None): if __name__ == '__main__': from doctest import testmod, NORMALIZE_WHITESPACE - import numpy optionflags = NORMALIZE_WHITESPACE testmod(optionflags=NORMALIZE_WHITESPACE, extraglobs={'np': numpy}) From efea028ed81f1f10d61a374739aa06f6c75e78f6 Mon Sep 17 00:00:00 2001 From: Holger Kohr Date: Thu, 6 Jul 2017 11:48:11 +0200 Subject: [PATCH 29/29] MAINT: remove debug print statement from gpuarray_buffer_cuda.c --- src/gpuarray_buffer_cuda.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 4bfb128f36..8f6e2cdb6f 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1076,8 +1076,6 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { , "-G", "-lineinfo" }; nvrtcResult err; - // DEBUG: print the kernel source - // printf("%s\n", src); opts[1] = ctx->bin_id;
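
For reference, a minimal usage sketch of the context-aware ufunc interface this series introduces. It assumes that 'sin' and 'add' are among the registered ufunc names, that 'add' maps to the C '+' operator in BINARY_UFUNC_TO_C_BINOP, that the device string 'cuda0' is valid, and that unary_ufunc/binary_ufunc accept the new ``context`` argument as the call sites above suggest; it is an illustration under those assumptions, not part of the patch:

    >>> import numpy
    >>> import pygpu
    >>> from pygpu import ufuncs
    >>> ctx = pygpu.init('cuda0')           # device name is an assumption
    >>> pygpu.set_default_context(ctx)
    >>> x = pygpu.gpuarray.array(numpy.linspace(0, 1, 5), context=ctx)
    >>> y = ufuncs.sin(x)                   # context taken from ``x``
    >>> s = ufuncs.add.reduce(x, axis=0)    # reduction built on the C '+' operator
    >>> z = ufuncs.sin(numpy.ones(5), context=ctx)  # host input, explicit context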