1) Problem: a data-cleaning job that writes the cleaned data to a target path. Submitting the JAR to the cluster fails with the error below, while parsing the same logs locally on Windows works fine.
Any guidance would be appreciated, thanks!
2) Error message:
java.lang.ClassNotFoundException: org.apache.poi.openxml4j.exceptions.InvalidFormatException
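For context on the error: a ClassNotFoundException for org.apache.poi.openxml4j.exceptions.InvalidFormatException means the Apache POI OOXML classes are not on the executors' classpath when the job runs on the cluster, even though they are available in the local Windows run. A minimal sketch, assuming an sbt build packaged with the sbt-assembly plugin (project name and versions are illustrative), of declaring the dependency so it is bundled into the submitted fat JAR:

// build.sbt (sketch; assumes sbt-assembly is enabled, versions are illustrative)
name := "spark-stat-clean"

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  // Spark itself is provided by the cluster at runtime, so it is excluded from the fat JAR
  "org.apache.spark" %% "spark-sql" % "2.3.0" % "provided",
  // poi-ooxml contains org.apache.poi.openxml4j.exceptions.InvalidFormatException
  "org.apache.poi" % "poi-ooxml" % "3.17"
)

Alternatively, the POI jars can be supplied at submit time via spark-submit's --jars option so the executor JVMs can load them.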
3) Code:
import org.apache.spark.sql.SparkSession

object SparkStatCleanJob {

  def main(args: Array[String]): Unit = {
    val path1 = "file:///home/hadoop/data/access.log"
    val path2 = "file:///home/hadoop/data/accessout"

    val spark = SparkSession.builder().appName("SparkStatCleanJob")
      .master("local[2]").getOrCreate()

    val accessRDD = spark.sparkContext.textFile(path1)
    // Convert each raw log line into a Row matching the predefined schema
    val accessRowRDD = accessRDD.map(x => AccessConvertUtils.parseLog(x))
    val accessDF = spark.createDataFrame(accessRowRDD, AccessConvertUtils.structType)
    // accessDF.printSchema()
    // accessDF.show(100, false)

    // coalesce controls the number of output files: write a single file
    accessDF.coalesce(1).write.format("parquet").mode("Overwrite")
      .partitionBy("day").save(path2)

    spark.stop()
  }
}