Skip to content

Commit

Permalink
Improve resolving encrypted columns
Browse files Browse the repository at this point in the history
  • Loading branch information
EnricoMi committed Feb 6, 2025
1 parent b7cdf84 commit 2b1f732
Showing 1 changed file with 49 additions and 8 deletions.
57 changes: 49 additions & 8 deletions cpp/src/parquet/encryption/encryption.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,62 @@ FileEncryptionProperties::Builder* FileEncryptionProperties::Builder::encrypted_

void FileEncryptionProperties::encrypt_schema(const SchemaDescriptor& schema) {
// Check that all columns in columnEncryptionProperties exist in the schema.
auto encrypted_columns = encrypted_columns_;
// Copy the encrypted_columns map as we are going to modify it while iterating it
auto encrypted_columns = ColumnPathToEncryptionPropertiesMap(encrypted_columns_);
// if columnEncryptionProperties is empty, every column in file schema will be
// encrypted with footer key.
if (encrypted_columns.size() != 0) {
std::vector<std::string> column_path_vec;
// First, save all column paths in schema.
std::vector<std::pair<std::string, std::string>> column_path_vec;
// First, memorize all column or schema paths of the schema as dot-strings.
for (int i = 0; i < schema.num_columns(); i++) {
column_path_vec.push_back(schema.Column(i)->path()->ToDotString());
auto column = schema.Column(i);
auto column_path = column->path()->ToDotString();
auto schema_path = column->schema_path()->ToDotString();
column_path_vec.emplace_back(column_path, column_path);
if (schema_path != column_path) {
column_path_vec.emplace_back(schema_path, column_path);
}
}
// Check if column exists in schema.
// Sort them alphabetically, so that we can use binary-search and look up parent columns.
std::sort(column_path_vec.begin(), column_path_vec.end());

// Check if encrypted column exists in schema, or if it is a parent field of a column.
for (const auto& elem : encrypted_columns) {
auto it = std::find(column_path_vec.begin(), column_path_vec.end(), elem.first);
if (it == column_path_vec.end()) {
auto& encrypted_column = elem.first;
auto encrypted_column_len = encrypted_column.size();

// Binary-search the sorted paths for the first entry that is not less than
// encrypted_column; a path that equals encrypted_column or starts with it sorts at or after this position.
auto it = std::lower_bound(
column_path_vec.begin(), column_path_vec.end(), encrypted_column,
[&](const std::pair<std::string, std::string>& item, const std::string& term) {
return item.first < term;
});
bool matches = false;

// encrypted_column encrypts column 'it' when 'it' is either equal to encrypted_column,
// or 'it' starts with encrypted_column followed by a '.'
while (it != column_path_vec.end() && (it->first == encrypted_column ||
(it->first.size() > encrypted_column_len && it->first.substr(0, encrypted_column_len) == encrypted_column && it->first.at(encrypted_column_len) == '.')
)) {
// remember that encrypted_column matches at least one column
matches = true;

// add column 'it' to file_encryption_properties.encrypted_columns
// when encrypted_column is a parent column
if (it->second != encrypted_column) {
encrypted_columns_.erase(encrypted_column);
encrypted_columns_.emplace(it->second, elem.second);
}

// move to next match
++it;
}

// check encrypted_column matches any existing column
if (!matches) {
std::stringstream ss;
ss << "Encrypted column " + elem.first + " not in file schema";
ss << "Encrypted column " + encrypted_column + " not in file schema";
throw ParquetException(ss.str());
}
}
Expand Down

0 comments on commit 2b1f732

Please sign in to comment.