Issue 86 : Add support for Datasource V2 : ORC #85

Open · wants to merge 5 commits into base: master
2 changes: 2 additions & 0 deletions src/it/scala/com/qubole/spark/hiveacid/LockSuite.scala
@@ -183,6 +183,8 @@ class TestLockHelper extends TestHelper {
.config("spark.hadoop.hive.txn.timeout", "6")
//.config("spark.ui.enabled", "true")
//.config("spark.ui.port", "4041")
// Run all existing (V1) tests against the V2 datasource
.config("spark.hive.acid.datasource.version", "v2")
.enableHiveSupport()
.getOrCreate()
}
32 changes: 20 additions & 12 deletions src/it/scala/com/qubole/spark/hiveacid/ReadSuite.scala
@@ -26,7 +26,7 @@ import org.scalatest._

import scala.util.control.NonFatal

@Ignore
//@Ignore
class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

val log: Logger = LogManager.getLogger(this.getClass)
@@ -222,9 +222,10 @@ class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll
// Special case of comparing result read before conversion
// and after conversion.
log.info("++ Compare result across conversion")
val (dfFromSql, dfFromScala) = helper.sparkGetDF(table)
val (dfFromSql, dfFromScala, dfFromSqlV2) = helper.sparkGetDF(table)
helper.compareResult(hiveResStr, dfFromSql.collect())
helper.compareResult(hiveResStr, dfFromScala.collect())
helper.compareResult(hiveResStr, dfFromSqlV2.collect())

helper.verify(table, insertOnly = false)
}
@@ -272,21 +273,22 @@ class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll

val hiveResStr = helper.hiveExecuteQuery(table.hiveSelect)

val (df1, df2) = helper.sparkGetDF(table)
val (df1, df2, dfV2) = helper.sparkGetDF(table)

// Materialize it once
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())

helper.hiveExecute(table.insertIntoHiveTableKey(11))
helper.hiveExecute(table.insertIntoHiveTableKey(12))
helper.hiveExecute(table.insertIntoHiveTableKey(13))
helper.hiveExecute(table.insertIntoHiveTableKey(14))
helper.hiveExecute(table.insertIntoHiveTableKey(15))
if (isPartitioned) {
compactPartitionedAndTest(hiveResStr, df1, df2, Seq(11,12,13,14,15))
compactPartitionedAndTest(hiveResStr, df1, df2, dfV2, Seq(11,12,13,14,15))
} else {
compactAndTest(hiveResStr, df1, df2)
compactAndTest(hiveResStr, df1, df2, dfV2)
}

// Shortcut for insert Only
@@ -296,43 +298,49 @@ class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll
helper.hiveExecute(table.deleteFromHiveTableKey(5))
helper.hiveExecute(table.deleteFromHiveTableKey(6))
if (isPartitioned) {
compactPartitionedAndTest(hiveResStr, df1, df2, Seq(3,4,5,6))
compactPartitionedAndTest(hiveResStr, df1, df2, dfV2, Seq(3,4,5,6))
} else {
compactAndTest(hiveResStr, df1, df2)
compactAndTest(hiveResStr, df1, df2, dfV2)
}

helper.hiveExecute(table.updateInHiveTableKey(7))
helper.hiveExecute(table.updateInHiveTableKey(8))
helper.hiveExecute(table.updateInHiveTableKey(9))
helper.hiveExecute(table.updateInHiveTableKey(10))
if (isPartitioned) {
compactPartitionedAndTest(hiveResStr, df1, df2, Seq(7,8,9,10))
compactPartitionedAndTest(hiveResStr, df1, df2, dfV2, Seq(7,8,9,10))
} else {
compactAndTest(hiveResStr, df1, df2)
compactAndTest(hiveResStr, df1, df2, dfV2)
}
}
}

def compactAndTest(hiveResStr: String, df1: DataFrame, df2: DataFrame): Unit = {
def compactAndTest(hiveResStr: String, df1: DataFrame, df2: DataFrame, dfV2: DataFrame): Unit = {
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
helper.hiveExecute(table.minorCompaction)
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
helper.hiveExecute(table.majorCompaction)
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
}

def compactPartitionedAndTest(hiveResStr: String, df1: DataFrame, df2: DataFrame, keys: Seq[Int]): Unit = {
def compactPartitionedAndTest(hiveResStr: String, df1: DataFrame, df2: DataFrame, dfV2: DataFrame, keys: Seq[Int]): Unit = {
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
keys.foreach(k => helper.hiveExecute(table.minorPartitionCompaction(k)))
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
keys.foreach((k: Int) => helper.hiveExecute(table.majorPartitionCompaction(k)))
helper.compareResult(hiveResStr, df1.collect())
helper.compareResult(hiveResStr, df2.collect())
helper.compareResult(hiveResStr, dfV2.collect())
}

helper.myRun(testName, code)
@@ -365,7 +373,7 @@ class ReadSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll
helper.hiveExecute(table2.insertIntoHiveTableKeyRange(10, 25))

var hiveResStr = helper.hiveExecuteQuery(Table.hiveJoin(table1, table2))
val sparkRes1 = helper.sparkCollect(Table.sparkJoin(table1, table2))
val sparkRes1 = helper.sparkCollect(Table.hiveJoin(table1, table2))
helper.compareResult(hiveResStr, sparkRes1)
}

24 changes: 15 additions & 9 deletions src/it/scala/com/qubole/spark/hiveacid/TestHelper.scala
@@ -76,26 +76,29 @@ class TestHelper extends SQLImplicits {
def compare(table: Table, msg: String): Unit = {
log.info(s"Verify simple $msg")
val hiveResStr = hiveExecuteQuery(table.hiveSelect)
val (dfFromSql, dfFromScala) = sparkGetDF(table)
val (dfFromSql, dfFromScala, dfFromSqlV2) = sparkGetDF(table)
compareResult(hiveResStr, dfFromSql.collect())
compareResult(hiveResStr, dfFromScala.collect())
compareResult(hiveResStr, dfFromSqlV2.collect())
}

// With Predicate
private def compareWithPred(table: Table, msg: String, pred: String): Unit = {
log.info(s"Verify with predicate $msg")
val hiveResStr = hiveExecuteQuery(table.hiveSelectWithPred(pred))
val (dfFromSql, dfFromScala) = sparkGetDFWithPred(table, pred)
val (dfFromSql, dfFromScala, dfFromSqlV2) = sparkGetDFWithPred(table, pred)
compareResult(hiveResStr, dfFromSql.collect())
compareResult(hiveResStr, dfFromScala.collect())
compareResult(hiveResStr, dfFromSqlV2.collect())
}
// With Projection
private def compareWithProj(table: Table, msg: String): Unit = {
log.info(s"Verify with projection $msg")
val hiveResStr = hiveExecuteQuery(table.hiveSelectWithProj)
val (dfFromSql, dfFromScala) = sparkGetDFWithProj(table)
val (dfFromSql, dfFromScala, dfFromSqlV2) = sparkGetDFWithProj(table)
compareResult(hiveResStr, dfFromSql.collect())
compareResult(hiveResStr, dfFromScala.collect())
compareResult(hiveResStr, dfFromSqlV2.collect())
}

// Compare result of 2 tables via hive
@@ -198,28 +201,31 @@ class TestHelper extends SQLImplicits {
compareWithProj(table, "After Delete")
}

def sparkGetDFWithProj(table: Table): (DataFrame, DataFrame) = {
def sparkGetDFWithProj(table: Table): (DataFrame, DataFrame, DataFrame) = {
val dfSql = sparkSQL(table.sparkSelect)
val dfSqlV2 = sparkSQL(table.hiveSelect)

var dfScala = spark.read.format("HiveAcid").options(Map("table" -> table.hiveTname)).load().select(table.sparkDFProj)
dfScala = totalOrderBy(table, dfScala)
(dfSql, dfScala)
(dfSql, dfScala, dfSqlV2)
}

def sparkGetDFWithPred(table: Table, pred: String): (DataFrame, DataFrame) = {
def sparkGetDFWithPred(table: Table, pred: String): (DataFrame, DataFrame, DataFrame) = {
val dfSql = sparkSQL(table.sparkSelectWithPred(pred))
val dfSqlV2 = sparkSQL(table.hiveSelectWithPred(pred))

var dfScala = spark.read.format("HiveAcid").options(Map("table" -> table.hiveTname)).load().where(col("intCol") < "5")
dfScala = totalOrderBy(table, dfScala)
(dfSql, dfScala)
(dfSql, dfScala, dfSqlV2)
}

def sparkGetDF(table: Table): (DataFrame, DataFrame) = {
def sparkGetDF(table: Table): (DataFrame, DataFrame, DataFrame) = {
val dfSql = sparkSQL(table.sparkSelect)
val dfSqlV2 = sparkSQL(table.hiveSelect)

var dfScala = spark.read.format("HiveAcid").options(Map("table" -> table.hiveTname)).load()
dfScala = totalOrderBy(table, dfScala)
(dfSql, dfScala)
(dfSql, dfScala, dfSqlV2)
}

def sparkSQL(cmd: String): DataFrame = {
2 changes: 2 additions & 0 deletions src/it/scala/com/qubole/spark/hiveacid/TestSparkSession.scala
@@ -30,6 +30,8 @@ private[hiveacid] object TestSparkSession {
.config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
//.config("spark.ui.enabled", "true")
//.config("spark.ui.port", "4041")
// Run all existing (V1) tests against the V2 datasource
.config("spark.hive.acid.datasource.version", "v2")
.enableHiveSupport()
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
28 changes: 24 additions & 4 deletions src/main/scala/com/qubole/spark/hiveacid/HiveAcidAutoConvert.scala
@@ -28,7 +28,10 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoTable, Log
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.LogicalRelation
import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource
import com.qubole.spark.hiveacid.datasource.{HiveAcidDataSource, HiveAcidDataSourceV2}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.SparkContext
import org.apache.spark.sql.internal.HiveSerDe


/**
@@ -43,14 +46,27 @@ case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] {
relation.tableMeta.properties.getOrElse("transactional", "false").toBoolean
}

private def convert(relation: HiveTableRelation): LogicalRelation = {
private def convert(relation: HiveTableRelation): LogicalPlan = {
val options = relation.tableMeta.properties ++
relation.tableMeta.storage.properties ++ Map("table" -> relation.tableMeta.qualifiedName)

val newRelation = new HiveAcidDataSource().createRelation(spark.sqlContext, options)
LogicalRelation(newRelation, isStreaming = false)
}

private def convertV2(relation: HiveTableRelation): LogicalPlan = {
val serde = relation.tableMeta.storage.serde.getOrElse("")
if (!serde.equalsIgnoreCase(HiveSerDe.sourceToSerDe("orc").get.serde.get)) {
// Only the ORC format is supported as of now. If it is not ORC, fall back to
// datasource V1.
logInfo("Falling back to datasource v1 as " + serde + " is not supported by v2 reader.")
return convert(relation)
}

Contributor: Add a log line stating the reason to fall back to v1?
Contributor (Author): done
val dbName = relation.tableMeta.identifier.database.getOrElse("default")
val tableName = relation.tableMeta.identifier.table
val tableOpts = Map("database" -> dbName, "table" -> tableName)
DataSourceV2Relation.create(new HiveAcidDataSourceV2, tableOpts, None, None)
}

override def apply(plan: LogicalPlan): LogicalPlan = {
plan resolveOperators {
// Write path
@@ -61,7 +77,11 @@ case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] {
// Read path
case relation: HiveTableRelation
if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) =>
convert(relation)
if (spark.conf.get("spark.hive.acid.datasource.version", "v1").equals("v2")) {
convertV2(relation)
} else {
convert(relation)
}
}
}
}
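
For context, a minimal sketch (not part of this PR's diff) of how the new config is meant to be used from an application, assuming the extension class and config key introduced above; the database and table names are hypothetical:

import org.apache.spark.sql.SparkSession

// Sketch only: enable the HiveAcid extension and select the v2 read path via config.
// With "v1" (the default) the existing LogicalRelation-based path is used instead.
val spark = SparkSession.builder()
  .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
  .config("spark.hive.acid.datasource.version", "v2")
  .enableHiveSupport()
  .getOrCreate()

// A plain SQL read on a transactional ORC table is rewritten by HiveAcidAutoConvert
// into a DataSourceV2Relation; non-ORC transactional tables fall back to the v1 path.
spark.sql("SELECT * FROM acid_db.acid_orc_tbl").show()  // hypothetical database/table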
@@ -0,0 +1,126 @@
/*
* Copyright 2019 Qubole, Inc. All rights reserved.
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.qubole.spark.hiveacid

import java.lang.String.format
import java.io.IOException
import java.util.{ArrayList, List, Map}

import org.apache.spark.sql.sources.v2.reader.DataSourceReader
import com.qubole.spark.hiveacid.hive.{HiveAcidMetadata, HiveConverter}
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.v2.DataSourceV2
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.sql.sources.v2._
import com.qubole.spark.hiveacid.transaction.HiveAcidTxn
import com.qubole.spark.hiveacid.util.{SerializableConfiguration, Util}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.sources.v2.reader._
import com.qubole.spark.hiveacid.reader.v2.HiveAcidInputPartitionV2
import com.qubole.spark.hiveacid.reader.TableReader
import com.qubole.spark.hiveacid.reader.hive.HiveAcidSearchArgument
import com.qubole.spark.hiveacid.reader.hive.HiveAcidSearchArgument.{buildTree, castLiteralValue, getPredicateLeafType, isSearchableType, quoteAttributeNameIfNeeded}

/**
* Data source V2 implementation for HiveACID
*/
class HiveAcidDataSourceV2Reader
extends DataSourceV2 with DataSourceReader with SupportsScanColumnarBatch
with SupportsPushDownRequiredColumns
with SupportsPushDownFilters with Logging {

def this(options: java.util.Map[String, String],
sparkSession : SparkSession,
dbName : String,
tblName : String) {
this()
this.options = options
this.sparkSession = sparkSession
if (dbName != null) {
hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(sparkSession, dbName + "." + tblName)
} else {
// If db name is null, default db is chosen.
hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(sparkSession, tblName)
}

// This is a hack to prevent the following situation:
// Spark(v 2.4.0) creates one instance of DataSourceReader to call readSchema()
// and then a new instance of DataSourceReader to call pushFilters(),
// planBatchInputPartitions() etc. Since it uses different DataSourceReader instances,
// and reads schema in former instance, schema remains null in the latter instance
// (which causes problems for other methods). More discussion:
// http://apache-spark-user-list.1001560.n3.nabble.com/DataSourceV2-APIs-creating-multiple-instances-of-DataSourceReader-and-hence-not-preserving-the-state-tc33646.html
// Also a null check on schema is already there in readSchema() to prevent initialization
// more than once just in case.
readSchema
}

private var options: java.util.Map[String, String] = null
private var sparkSession : SparkSession = null

//The pruned schema
private var schema: StructType = null

private var pushedFilterArray : Array[Filter] = null

private var hiveAcidMetadata: HiveAcidMetadata = _

override def readSchema: StructType = {
if (schema == null) {
schema = hiveAcidMetadata.tableSchema
}
schema
}

override def planBatchInputPartitions() : java.util.List[InputPartition[ColumnarBatch]] = {
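// The partitions are planned inside a Hive ACID transaction (inTxn below) so that
// every split is resolved against a single, consistent snapshot of the table.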
val factories = new java.util.ArrayList[InputPartition[ColumnarBatch]]
inTxn {
txn: HiveAcidTxn => {
import scala.collection.JavaConversions._
val reader = new TableReader(sparkSession, txn, hiveAcidMetadata)
val hiveReader = reader.getPartitionsV2(schema.fieldNames,
pushedFilterArray, new SparkAcidConf(sparkSession, options.toMap))
factories.addAll(hiveReader)
}
}
factories
}

private def inTxn(f: HiveAcidTxn => Unit): Unit = {
new HiveTxnWrapper(sparkSession).inTxn(f)
}

override def pushFilters (filters: Array[Filter]): Array[Filter] = {
this.pushedFilterArray = HiveAcidSearchArgument.
getSupportedFilters(hiveAcidMetadata.tableSchema, filters.toSeq).toArray
// ORC does not do row-level filtering, so the filters have to be applied again.
filters
}

override def pushedFilters(): Array[Filter] = this.pushedFilterArray

override def pruneColumns(requiredSchema: StructType): Unit = {
this.schema = requiredSchema
}
}
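
The DataSourceV2 entry point (HiveAcidDataSourceV2) referenced by HiveAcidAutoConvert is not included in this diff. The following is only a sketch of how such an entry point could hand the "database"/"table" options over to the reader above, assuming Spark 2.4's ReadSupport interface; the class name HiveAcidDataSourceV2Sketch is illustrative, not the PR's implementation.

import com.qubole.spark.hiveacid.HiveAcidDataSourceV2Reader
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, ReadSupport}
import org.apache.spark.sql.sources.v2.reader.DataSourceReader

// Illustrative sketch: wires the options passed by DataSourceV2Relation.create(...)
// into the HiveAcidDataSourceV2Reader defined above.
class HiveAcidDataSourceV2Sketch extends DataSourceV2 with ReadSupport {
  override def createReader(options: DataSourceOptions): DataSourceReader = {
    new HiveAcidDataSourceV2Reader(
      options.asMap(),                       // raw option map as supplied by Spark
      SparkSession.active,                   // session the scan will run in
      options.get("database").orElse(null),  // null lets the reader pick the default db
      options.get("table").orElse(null))
  }
}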
@@ -231,7 +231,7 @@ object HiveAcidTable {
* This wrapper can be used just once for running an operation. That operation is not allowed to call this recursively.
* @param sparkSession
*/
Contributor: @amoghmargoor @maheshk114: Do we need to make changes to the createDF API in HiveAcidTable to use v2 readers if enabled?

Contributor (Author): I think it should pick the reader based on the config. I will create a Jira for that.

private class HiveTxnWrapper(sparkSession: SparkSession) extends Logging {
private[hiveacid] class HiveTxnWrapper(sparkSession: SparkSession) extends Logging {

private var isLocalTxn: Boolean = _
private var curTxn: HiveAcidTxn = _