diff --git a/conf/benchmarks.lst b/conf/benchmarks.lst index 3ef7717b5..4d62ba3a4 100644 --- a/conf/benchmarks.lst +++ b/conf/benchmarks.lst @@ -24,5 +24,6 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost -graph.nweight \ No newline at end of file +graph.nweight diff --git a/conf/workloads/ml/xgboost.conf b/conf/workloads/ml/xgboost.conf index 7788f6242..9c04e6da6 100644 --- a/conf/workloads/ml/xgboost.conf +++ b/conf/workloads/ml/xgboost.conf @@ -17,7 +17,7 @@ hibench.xgboost.features ${hibench.xgboost.${hibench.scale.pr hibench.xgboost.partitions ${hibench.default.map.parallelism} hibench.xgboost.numClasses 2 -hibench.xgboost.maxDepth 30 +hibench.xgboost.maxDepth 8 hibench.xgboost.maxBins 32 hibench.xgboost.numIterations 20 hibench.xgboost.learningRate 0.1 diff --git a/pom.xml b/pom.xml index 6bfe47d04..4e90e1178 100644 --- a/pom.xml +++ b/pom.xml @@ -79,6 +79,17 @@ Scala-tools Maven 2 Repository https://oss.sonatype.org/content/groups/scala-tools/ + + xgboostrepo + XGBoost Maven Repo + https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/release + + true + + + false + + diff --git a/sparkbench/ml/pom.xml b/sparkbench/ml/pom.xml index 44ae66cc9..32075c625 100644 --- a/sparkbench/ml/pom.xml +++ b/sparkbench/ml/pom.xml @@ -56,12 +56,12 @@ ml.dmlc xgboost4j_${scala.binary.version} - 1.0.0 + 1.1.0 ml.dmlc xgboost4j-spark_${scala.binary.version} - 1.0.0 + 1.1.0 diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala index 9f68e5abd..7664dab1d 100644 --- a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala +++ b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/XGBoost.scala @@ -18,11 +18,9 @@ package com.intel.hibench.sparkbench.ml import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.mllib.tree.GradientBoostedTrees -import org.apache.spark.mllib.tree.configuration.BoostingStrategy -import org.apache.spark.mllib.tree.model.GradientBoostedTreesModel import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator @@ -33,7 +31,7 @@ object XGBoost { case class Params( numClasses: Int = 2, - maxDepth: Int = 30, + maxDepth: Int = 8, maxBins: Int = 32, numIterations: Int = 20, learningRate: Double = 0.1, @@ -93,7 +91,7 @@ object XGBoost { val mllibRDD: RDD[LabeledPoint] = sc.objectFile(dataPath) // Convert to ML LabeledPoint and to DataFrame val mlRDD: RDD[NewLabeledPoint] = mllibRDD.map { p => NewLabeledPoint(p.label, p.features.asML) } - val data = mlRDD.toDF + val data = mlRDD.toDF("label", "features") // Split the data into training and test sets (30% held out for testing) val splits = data.randomSplit(Array(0.7, 0.3)) @@ -123,7 +121,9 @@ object XGBoost { setFeaturesCol("features"). setLabelCol("label") - val model = xgbClassifier.fit(trainingData) + val pipeline = new Pipeline().setStages(Array(xgbClassifier)) + + val model = pipeline.fit(trainingData) // Make predictions. val predictions = model.transform(testData) diff --git a/travis/benchmarks_ml.lst b/travis/benchmarks_ml.lst index 6e4894a5a..362df64d4 100644 --- a/travis/benchmarks_ml.lst +++ b/travis/benchmarks_ml.lst @@ -9,3 +9,4 @@ ml.linear ml.lda ml.svm ml.gmm +ml.xgboost