diff --git a/python/datafusion/tests/test_context.py b/python/datafusion/tests/test_context.py
index df7e1813b..4e9db0881 100644
--- a/python/datafusion/tests/test_context.py
+++ b/python/datafusion/tests/test_context.py
@@ -303,6 +303,25 @@ def test_dataset_filter(ctx, capfd):
     assert result[0].column(1) == pa.array([-3])
 
 
+def test_pyarrow_predicate_pushdown_is_null(ctx, capfd):
+    """Ensure that the pyarrow filter gets pushed down for `IsNull`."""
+    # create a RecordBatch and register it as a pyarrow.dataset.Dataset
+    batch = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([7, None, 9])],
+        names=["a", "b", "c"],
+    )
+    dataset = ds.dataset([batch])
+    ctx.register_dataset("t", dataset)
+    # Make sure the filter was pushed down into the physical plan
+    df = ctx.sql("SELECT a FROM t WHERE c is NULL")
+    df.explain()
+    captured = capfd.readouterr()
+    assert "filter_expr=is_null(c, {nan_is_null=false})" in captured.out
+
+    result = df.collect()
+    assert result[0].column(0) == pa.array([2])
+
+
 def test_dataset_filter_nested_data(ctx):
     # create Arrow StructArrays to test nested data types
     data = pa.StructArray.from_arrays(
diff --git a/src/pyarrow_filter_expression.rs b/src/pyarrow_filter_expression.rs
index fca885121..5f2c9592d 100644
--- a/src/pyarrow_filter_expression.rs
+++ b/src/pyarrow_filter_expression.rs
@@ -138,8 +138,13 @@ impl TryFrom<&Expr> for PyArrowFilterExpression {
                 let expr = PyArrowFilterExpression::try_from(expr.as_ref())?
                     .0
                     .into_bound(py);
-                // TODO: this expression does not seems like it should be `call_method0`
-                Ok(expr.clone().call_method1("is_null", (expr,))?)
+
+                // https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Expression.html#pyarrow.dataset.Expression.is_null
+                // `nan_is_null` controls whether floating-point NaNs are considered null.
+                let nan_is_null = false;
+
+                let res = expr.call_method1("is_null", (nan_is_null,))?;
+                Ok(res)
             }
             Expr::Between(Between {
                 expr,
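
For context, a minimal standalone sketch of the pyarrow API that the Rust side now invokes through `call_method1`: `pyarrow.dataset.Expression.is_null(nan_is_null=...)`. This is not part of the change; it only illustrates where the `is_null(c, {nan_is_null=false})` string asserted in the test comes from (the column names and data mirror the test above but are otherwise arbitrary), so the expected outputs in the comments are assumptions about pyarrow's behavior, not captured output.

import pyarrow as pa
import pyarrow.dataset as ds

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array([7, None, 9])],
    names=["a", "c"],
)
dataset = ds.dataset([batch])

# Build the predicate directly in pyarrow; nan_is_null=False means floating-point
# NaNs are not treated as null, matching the value passed from the Rust side.
predicate = ds.field("c").is_null(nan_is_null=False)
print(predicate)  # expected to print something like: is_null(c, {nan_is_null=false})

# Scanning with the predicate keeps only the row where c is null.
table = dataset.to_table(columns=["a"], filter=predicate)
print(table.column("a").to_pylist())  # expected: [2]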