Skip to content

Commit

Permalink
release 0.3.0 (#12)
Browse files Browse the repository at this point in the history
* release 0.3.0

* fix black/click

* update mypy

* protobuf dep fixes
  • Loading branch information
crflynn authored Apr 19, 2022
1 parent 7ae519e commit eab1cd7
Show file tree
Hide file tree
Showing 6 changed files with 214 additions and 281 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# Changelog

## 2021-12-05
## 2022-04-19 - 0.3.0

* Breaking: protobuf bytes fields will now convert directly to spark ByteType and vice versa.
* Relax constraint on pyspark
* Bump minimum protobuf version to 3.20.0

## 2021-12-05 - 0.2.0

* Add `to_protobuf` method to encode pyspark structs to protobuf

## 2021-12-01
## 2021-12-01 - 0.1.0

* initial release
91 changes: 47 additions & 44 deletions example/example_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,16 @@ import google.protobuf.timestamp_pb2
import typing
import typing_extensions

DESCRIPTOR: google.protobuf.descriptor.FileDescriptor = ...
DESCRIPTOR: google.protobuf.descriptor.FileDescriptor

class SimpleMessage(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor = ...
DESCRIPTOR: google.protobuf.descriptor.Descriptor
NAME_FIELD_NUMBER: builtins.int
QUANTITY_FIELD_NUMBER: builtins.int
MEASURE_FIELD_NUMBER: builtins.int
name: typing.Text = ...
quantity: builtins.int = ...
measure: builtins.float = ...
name: typing.Text
quantity: builtins.int
measure: builtins.float
def __init__(
self,
*,
Expand All @@ -39,11 +39,11 @@ class SimpleMessage(google.protobuf.message.Message):
global___SimpleMessage = SimpleMessage

class NestedMessage(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor = ...
DESCRIPTOR: google.protobuf.descriptor.Descriptor
KEY_FIELD_NUMBER: builtins.int
VALUE_FIELD_NUMBER: builtins.int
key: typing.Text = ...
value: typing.Text = ...
key: typing.Text
value: typing.Text
def __init__(
self,
*,
Expand All @@ -57,9 +57,9 @@ class NestedMessage(google.protobuf.message.Message):
global___NestedMessage = NestedMessage

class DecimalMessage(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor = ...
DESCRIPTOR: google.protobuf.descriptor.Descriptor
VALUE_FIELD_NUMBER: builtins.int
value: typing.Text = ...
value: typing.Text
def __init__(
self,
*,
Expand All @@ -72,28 +72,31 @@ class DecimalMessage(google.protobuf.message.Message):
global___DecimalMessage = DecimalMessage

class ExampleMessage(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor = ...
class SomeEnum(_SomeEnum, metaclass=_SomeEnumEnumTypeWrapper):
pass
DESCRIPTOR: google.protobuf.descriptor.Descriptor
class _SomeEnum:
V = typing.NewType("V", builtins.int)
ValueType = typing.NewType("ValueType", builtins.int)
V: typing_extensions.TypeAlias = ValueType
class _SomeEnumEnumTypeWrapper(
google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_SomeEnum.V],
google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[
ExampleMessage._SomeEnum.ValueType
],
builtins.type,
):
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor = ...
unspecified = ExampleMessage.SomeEnum.V(0)
first = ExampleMessage.SomeEnum.V(1)
second = ExampleMessage.SomeEnum.V(2)
unspecified = ExampleMessage.SomeEnum.V(0)
first = ExampleMessage.SomeEnum.V(1)
second = ExampleMessage.SomeEnum.V(2)
DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor
unspecified: ExampleMessage._SomeEnum.ValueType # 0
first: ExampleMessage._SomeEnum.ValueType # 1
second: ExampleMessage._SomeEnum.ValueType # 2
class SomeEnum(_SomeEnum, metaclass=_SomeEnumEnumTypeWrapper):
pass
unspecified: ExampleMessage.SomeEnum.ValueType # 0
first: ExampleMessage.SomeEnum.ValueType # 1
second: ExampleMessage.SomeEnum.ValueType # 2
class MapEntry(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor = ...
DESCRIPTOR: google.protobuf.descriptor.Descriptor
KEY_FIELD_NUMBER: builtins.int
VALUE_FIELD_NUMBER: builtins.int
key: typing.Text = ...
value: typing.Text = ...
key: typing.Text
value: typing.Text
def __init__(
self,
*,
Expand Down Expand Up @@ -128,15 +131,15 @@ class ExampleMessage(google.protobuf.message.Message):
TIMESTAMP_FIELD_NUMBER: builtins.int
DURATION_FIELD_NUMBER: builtins.int
DECIMAL_FIELD_NUMBER: builtins.int
int32: builtins.int = ...
int64: builtins.int = ...
uint32: builtins.int = ...
uint64: builtins.int = ...
double: builtins.float = ...
float: builtins.float = ...
bool: builtins.bool = ...
enum: global___ExampleMessage.SomeEnum.V = ...
string: typing.Text = ...
int32: builtins.int
int64: builtins.int
uint32: builtins.int
uint64: builtins.int
double: builtins.float
float: builtins.float
bool: builtins.bool
enum: global___ExampleMessage.SomeEnum.ValueType
string: typing.Text
@property
def nested(self) -> global___NestedMessage: ...
@property
Expand All @@ -145,15 +148,15 @@ class ExampleMessage(google.protobuf.message.Message):
) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[
typing.Text
]: ...
bytes: builtins.bytes = ...
sfixed32: builtins.int = ...
sfixed64: builtins.int = ...
sint32: builtins.int = ...
sint64: builtins.int = ...
fixed32: builtins.int = ...
fixed64: builtins.int = ...
oneofstring: typing.Text = ...
oneofint32: builtins.int = ...
bytes: builtins.bytes
sfixed32: builtins.int
sfixed64: builtins.int
sint32: builtins.int
sint64: builtins.int
fixed32: builtins.int
fixed64: builtins.int
oneofstring: typing.Text
oneofint32: builtins.int
@property
def map(
self,
Expand All @@ -174,7 +177,7 @@ class ExampleMessage(google.protobuf.message.Message):
double: builtins.float = ...,
float: builtins.float = ...,
bool: builtins.bool = ...,
enum: global___ExampleMessage.SomeEnum.V = ...,
enum: global___ExampleMessage.SomeEnum.ValueType = ...,
string: typing.Text = ...,
nested: typing.Optional[global___NestedMessage] = ...,
stringlist: typing.Optional[typing.Iterable[typing.Text]] = ...,
Expand Down
17 changes: 10 additions & 7 deletions pbspark/_proto.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
from functools import wraps

from google.protobuf import json_format
from google.protobuf.descriptor import Descriptor
from google.protobuf.descriptor import FieldDescriptor
from google.protobuf.descriptor_pool import DescriptorPool
from google.protobuf.message import Message
from google.protobuf.pyext._message import Descriptor # type: ignore
from google.protobuf.timestamp_pb2 import Timestamp
from pyspark.sql import Column
from pyspark.sql.functions import col
Expand Down Expand Up @@ -73,11 +74,11 @@ def __init__(self, custom_deserializers=None, **kwargs):
self._custom_deserializers = custom_deserializers or {}
super().__init__(**kwargs)

def ConvertMessage(self, value, message):
def ConvertMessage(self, value, message, path):
full_name = message.DESCRIPTOR.full_name
if full_name in self._custom_deserializers:
return self._custom_deserializers[full_name](value)
return super().ConvertMessage(value, message)
return super().ConvertMessage(value, message, path)


# protobuf converts to/from b64 strings, but we prefer to stay as bytes.
Expand Down Expand Up @@ -190,16 +191,18 @@ def parse_dict(
self,
value: dict,
message: Message,
ignore_unknown_fields=False,
descriptor_pool=None,
ignore_unknown_fields: bool = False,
descriptor_pool: t.Optional[DescriptorPool] = None,
max_recursion_depth: int = 100,
):
"""Custom ParseDict using overridden parser."""
parser = _Parser(
custom_deserializers=self._custom_deserializers,
ignore_unknown_fields=ignore_unknown_fields,
descriptor_pool=descriptor_pool,
max_recursion_depth=max_recursion_depth,
)
return parser.ConvertMessage(value=value, message=message)
return parser.ConvertMessage(value=value, message=message, path=None)

def get_spark_schema(
self,
Expand All @@ -218,7 +221,7 @@ def get_spark_schema(
if inspect.isclass(descriptor) and issubclass(descriptor, Message):
descriptor_ = descriptor.DESCRIPTOR
else:
descriptor_ = descriptor
descriptor_ = descriptor # type: ignore[assignment]
for field in descriptor_.fields:
spark_type: DataType
if field.cpp_type == FieldDescriptor.CPPTYPE_MESSAGE:
Expand Down
2 changes: 1 addition & 1 deletion pbspark/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.3.0"
Loading

0 comments on commit eab1cd7

Please sign in to comment.