Skip to content

Commit

Permalink
Merge pull request apache#381 from Parquet/fix_concurency_problem
Browse files Browse the repository at this point in the history
fix metadata concurrency problem
  • Loading branch information
tsdeng committed Apr 29, 2014
2 parents 76d05fa + 9a38aec commit 6aed528
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 33 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* Copyright 2014 Twitter, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package parquet.hadoop.metadata;

import java.util.concurrent.ConcurrentHashMap;

/**
* returns canonical representation of objects (similar to String.intern()) to save memory
* if a.equals(b) then canonicalize(a) == canonicalize(b)
* this class is thread safe
* @author Julien Le Dem
*
* @param <T>
*/
public class Canonicalizer<T> {

private ConcurrentHashMap<T, T> canonicals = new ConcurrentHashMap<T, T>();

/**
* @param value the value to canonicalize
* @return the corresponding canonical value
*/
final public T canonicalize(T value) {
T canonical = canonicals.get(value);
if (canonical == null) {
value = toCanonical(value);
T existing = canonicals.putIfAbsent(value, value);
// putIfAbsent is atomic, making sure we always return the same canonical representation of the value
if (existing == null) {
canonical = value;
} else {
canonical = existing;
}
}
return canonical;
}

/**
* @param value the value to canonicalize if needed
* @return the canonicalized value
*/
protected T toCanonical(T value) {
return value;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,17 @@
package parquet.hadoop.metadata;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import parquet.column.Encoding;
import parquet.schema.PrimitiveType.PrimitiveTypeName;

public class ColumnChunkProperties {

private static Map<ColumnChunkProperties, ColumnChunkProperties> cache = new HashMap<ColumnChunkProperties, ColumnChunkProperties>();
private static Canonicalizer<ColumnChunkProperties> properties = new Canonicalizer<ColumnChunkProperties>();

public static ColumnChunkProperties get(ColumnPath path, PrimitiveTypeName type, CompressionCodecName codec, Set<Encoding> encodings) {
ColumnChunkProperties key = new ColumnChunkProperties(codec, path, type, encodings);
ColumnChunkProperties cached = cache.get(key);
if (cached == null) {
cached = key;
cache.put(key, cached);
}
return cached;
return properties.canonicalize(new ColumnChunkProperties(codec, path, type, encodings));
}

private final CompressionCodecName codec;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,22 @@
package parquet.hadoop.metadata;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public final class ColumnPath implements Iterable<String> {

private static Map<ColumnPath, ColumnPath> paths = new HashMap<ColumnPath, ColumnPath>();

public static ColumnPath get(String... path){
ColumnPath key = new ColumnPath(path);
ColumnPath cached = paths.get(key);
if (cached == null) {
for (int i = 0; i < path.length; i++) {
path[i] = path[i].intern();
private static Canonicalizer<ColumnPath> paths = new Canonicalizer<ColumnPath>() {
protected ColumnPath toCanonical(ColumnPath value) {
String[] path = new String[value.p.length];
for (int i = 0; i < value.p.length; i++) {
path[i] = value.p[i].intern();
}
cached = key;
paths.put(key, cached);
return new ColumnPath(path);
}
return cached;
};

/**
 * Returns the canonical {@link ColumnPath} for the given path elements.
 * Equal paths always yield the same instance (see {@code Canonicalizer}).
 * @param path the elements of the column path
 * @return a canonical ColumnPath equal to {@code new ColumnPath(path)}
 */
public static ColumnPath get(String... path){
return paths.canonicalize(new ColumnPath(path));
}

private final String[] p;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,17 @@
package parquet.hadoop.metadata;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import parquet.column.Encoding;

public class EncodingList implements Iterable<Encoding> {

private static Map<EncodingList, EncodingList> encodingLists = new HashMap<EncodingList, EncodingList>();
private static Canonicalizer<EncodingList> encodingLists = new Canonicalizer<EncodingList>();

public static EncodingList getEncodingList(List<Encoding> encodings) {
EncodingList key = new EncodingList(encodings);
EncodingList cached = encodingLists.get(key);
if (cached == null) {
cached = key;
encodingLists.put(key, cached);
}
return cached;
return encodingLists.canonicalize(new EncodingList(encodings));
}

private final List<Encoding> encodings;
Expand Down

0 comments on commit 6aed528

Please sign in to comment.