Skip to content

Commit

Permalink
Implement overwrite function in extract-json feature.
Browse files Browse the repository at this point in the history
  • Loading branch information
pravinbhat committed Aug 28, 2024
1 parent d7a543b commit b1a8192
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 3 deletions.
7 changes: 7 additions & 0 deletions src/main/java/com/datastax/cdm/feature/ExtractJson.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class ExtractJson extends AbstractFeature {

private String targetColumnName = "";
private Integer targetColumnIndex = -1;
private boolean overwriteTarget = false;

@Override
public boolean loadProperties(IPropertyHelper helper) {
Expand All @@ -49,6 +50,8 @@ public boolean loadProperties(IPropertyHelper helper) {

originColumnName = getColumnName(helper, KnownProperties.EXTRACT_JSON_ORIGIN_COLUMN_NAME);
targetColumnName = getColumnName(helper, KnownProperties.EXTRACT_JSON_TARGET_COLUMN_MAPPING);
overwriteTarget = helper.getBoolean(KnownProperties.EXTRACT_JSON_TARGET_OVERWRITE);

// Convert columnToFieldMapping to targetColumnName and originJsonFieldName
if (!targetColumnName.isBlank()) {
String[] parts = targetColumnName.split("\\:");
Expand Down Expand Up @@ -146,6 +149,10 @@ public String getTargetColumnName() {
return isEnabled ? targetColumnName : "";
}

/**
 * Indicates whether the extracted JSON value should overwrite a non-null value
 * already present in the Target column during a Validation run.
 *
 * @return {@code true} only when this feature is enabled and the
 *         {@code spark.cdm.feature.extractJson.overwrite} property is set to true
 */
public boolean overwriteTarget() {
    // Gate on isEnabled for consistency with the other accessors in this
    // feature (e.g. getTargetColumnName returns "" when disabled), so a
    // disabled feature can never report overwrite=true to callers.
    return isEnabled && overwriteTarget;
}

private String getColumnName(IPropertyHelper helper, String colName) {
String columnName = CqlTable.unFormatName(helper.getString(colName));
return (null == columnName) ? "" : columnName;
Expand Down
11 changes: 9 additions & 2 deletions src/main/java/com/datastax/cdm/job/DiffJobSession.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ public class DiffJobSession extends CopyJobSession {
boolean logDebug = logger.isDebugEnabled();
boolean logTrace = logger.isTraceEnabled();
private ExtractJson extractJsonFeature;
private boolean overwriteTarget;

public DiffJobSession(CqlSession originSession, CqlSession targetSession, SparkConf sc) {
super(originSession, targetSession, sc);
Expand Down Expand Up @@ -111,6 +112,7 @@ public DiffJobSession(CqlSession originSession, CqlSession targetSession, SparkC
}

extractJsonFeature = (ExtractJson) this.targetSession.getCqlTable().getFeature(Featureset.EXTRACT_JSON);
overwriteTarget = extractJsonFeature.isEnabled() && extractJsonFeature.overwriteTarget();

logger.info("CQL -- origin select: {}", this.originSession.getOriginSelectByPartitionRangeStatement().getCQL());
logger.info("CQL -- target select: {}", this.targetSession.getTargetSelectByPKStatement().getCQL());
Expand Down Expand Up @@ -270,7 +272,13 @@ private String isDifferent(Record record) {
logger.trace("PK {}, targetIndex {} skipping constant column {}", pk, targetIndex,
targetColumnNames.get(targetIndex));
return; // nothing to compare in origin
} else if (targetIndex == extractJsonFeature.getTargetColumnIndex()) {
}

targetAsOriginType = targetSession.getCqlTable().getAndConvertData(targetIndex, targetRow);
if (targetIndex == extractJsonFeature.getTargetColumnIndex()) {
if (!overwriteTarget && null != targetAsOriginType) {
return; // skip validation when target has data
}
originIndex = extractJsonFeature.getOriginColumnIndex();
origin = extractJsonFeature.extract(originRow.getString(originIndex));
} else {
Expand Down Expand Up @@ -301,7 +309,6 @@ private String isDifferent(Record record) {
+ explodeMapKeyIndex + ", valueIndex:" + explodeMapValueIndex + ")");
}
}
targetAsOriginType = targetSession.getCqlTable().getAndConvertData(targetIndex, targetRow);

if (logDebug)
logger.debug(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,15 @@ public enum PropertyType {
public static final String EXTRACT_JSON_EXCLUSIVE = "spark.cdm.feature.extractJson.exclusive";
public static final String EXTRACT_JSON_ORIGIN_COLUMN_NAME = "spark.cdm.feature.extractJson.originColumn";
public static final String EXTRACT_JSON_TARGET_COLUMN_MAPPING = "spark.cdm.feature.extractJson.propertyMapping";
public static final String EXTRACT_JSON_TARGET_OVERWRITE = "spark.cdm.feature.extractJson.overwrite";

// Registers the extract-json property types and defaults. As rendered, the
// original span contained a misplaced closing brace with statements after it
// and the final brace fused onto the last statement line; reconstructed here
// as a single well-formed static initializer.
static {
    types.put(EXTRACT_JSON_EXCLUSIVE, PropertyType.BOOLEAN);
    defaults.put(EXTRACT_JSON_EXCLUSIVE, "false");
    types.put(EXTRACT_JSON_ORIGIN_COLUMN_NAME, PropertyType.STRING);
    types.put(EXTRACT_JSON_TARGET_COLUMN_MAPPING, PropertyType.STRING);
    types.put(EXTRACT_JSON_TARGET_OVERWRITE, PropertyType.BOOLEAN);
    // Default matches the documented behavior in cdm-detailed.properties:
    // overwrite is off unless explicitly enabled.
    defaults.put(EXTRACT_JSON_TARGET_OVERWRITE, "false");
}

// ==========================================================================
// Guardrail Feature
Expand Down
6 changes: 6 additions & 0 deletions src/resources/cdm-detailed.properties
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,16 @@ spark.cdm.perfops.ratelimit.target 20000
# - If the specified JSON property does not exist in the JSON content, the Target column
# will be set to null.
# Note: This feature currently supports extraction of only one JSON property.
#
# .overwrite          Default is false. This property applies only to Validation runs (it has no
#                     effect on Migration). When set to true, the extracted JSON value is validated
#                     against the Target column even when that column already holds a value. When
#                     false, validation is skipped for rows whose Target column is non-null.
#-----------------------------------------------------------------------------------------------------------
#spark.cdm.feature.extractJson.exclusive false
#spark.cdm.feature.extractJson.originColumn origin_columnname_with_json_content
#spark.cdm.feature.extractJson.propertyMapping origin_json_propertyname:target_columnname
#spark.cdm.feature.extractJson.overwrite false

#===========================================================================================================
# Guardrail feature manages records that exceed guardrail checks. The Guardrail job will generate a
Expand Down

0 comments on commit b1a8192

Please sign in to comment.