Updating Python code from python2 to python3 #58

Open · wants to merge 3 commits into base: master
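
The diffs below apply two mechanical Python 2 → Python 3 conversions across the book's code samples: print statements become calls to the print() function, and long-integer literals (1L, 5L, 348113L) drop the L suffix. A minimal standalone sketch of both patterns, illustrative only and not taken from any file in this PR:

count = 348113                      # was: count = 348113L  (Python 2 long literal)
print("count is " + str(count))     # was: print "count is " + str(count)
seed = 1                            # was: seed = 1L
print(isinstance(seed, int))        # True; Python 3 has a single arbitrary-precision int type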
@@ -128,7 +128,7 @@
from pyspark.ml.clustering import KMeans
kmeans = KMeans()\
.setK(20)\
-.setSeed(1L)
+.setSeed(1)


# COMMAND ----------
@@ -38,7 +38,7 @@

# COMMAND ----------

-print lr.explainParams()
+print(lr.explainParams())


# COMMAND ----------
@@ -6,20 +6,20 @@

from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
-print lr.explainParams() # see all parameters
+print(lr.explainParams()) # see all parameters
lrModel = lr.fit(bInput)


# COMMAND ----------

-print lrModel.coefficients
-print lrModel.intercept
+print(lrModel.coefficients)
+print(lrModel.intercept)


# COMMAND ----------

summary = lrModel.summary
-print summary.areaUnderROC
+print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

@@ -33,31 +33,31 @@

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
-print dt.explainParams()
+print(dt.explainParams())
dtModel = dt.fit(bInput)


# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
-print rfClassifier.explainParams()
+print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)


# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
-print gbtClassifier.explainParams()
+print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)


# COMMAND ----------

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()
-print nb.explainParams()
+print(nb.explainParams())
trainedModel = nb.fit(bInput.where("label != 0"))


@@ -72,9 +72,9 @@

# COMMAND ----------

-print metrics.areaUnderPR
-print metrics.areaUnderROC
-print "Receiver Operating Characteristic"
+print(metrics.areaUnderPR)
+print(metrics.areaUnderROC)
+print("Receiver Operating Characteristic")
metrics.roc.toDF().show()


@@ -5,18 +5,18 @@

from pyspark.ml.regression import LinearRegression
lr = LinearRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
-print lr.explainParams()
+print(lr.explainParams())
lrModel = lr.fit(df)


# COMMAND ----------

summary = lrModel.summary
summary.residuals.show()
-print summary.totalIterations
-print summary.objectiveHistory
-print summary.rootMeanSquaredError
-print summary.r2
+print(summary.totalIterations)
+print(summary.objectiveHistory)
+print(summary.rootMeanSquaredError)
+print(summary.r2)


# COMMAND ----------
@@ -28,15 +28,15 @@
.setMaxIter(10)\
.setRegParam(0.3)\
.setLinkPredictionCol("linkOut")
-print glr.explainParams()
+print(glr.explainParams())
glrModel = glr.fit(df)


# COMMAND ----------

from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
-print dtr.explainParams()
+print(dtr.explainParams())
dtrModel = dtr.fit(df)


@@ -45,10 +45,10 @@
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
-print rf.explainParams()
+print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
-print gbt.explainParams()
+print(gbt.explainParams())
gbtModel = gbt.fit(df)


@@ -79,11 +79,11 @@
out = model.transform(df)\
.select("prediction", "label").rdd.map(lambda x: (float(x[0]), float(x[1])))
metrics = RegressionMetrics(out)
print "MSE: " + str(metrics.meanSquaredError)
print "RMSE: " + str(metrics.rootMeanSquaredError)
print "R-squared: " + str(metrics.r2)
print "MAE: " + str(metrics.meanAbsoluteError)
print "Explained variance: " + str(metrics.explainedVariance)
print("MSE: " + str(metrics.meanSquaredError))
print("RMSE: " + str(metrics.rootMeanSquaredError))
print("R-squared: " + str(metrics.r2))
print("MAE: " + str(metrics.meanAbsoluteError))
print("Explained variance: " + str(metrics.explainedVariance))


# COMMAND ----------
@@ -15,7 +15,7 @@
.setUserCol("userId")\
.setItemCol("movieId")\
.setRatingCol("rating")
-print als.explainParams()
+print(als.explainParams())
alsModel = als.fit(training)
predictions = alsModel.transform(test)

@@ -18,14 +18,14 @@

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
-print km.explainParams()
+print(km.explainParams())
kmModel = km.fit(sales)


# COMMAND ----------

summary = kmModel.summary
-print summary.clusterSizes # number of points
+print(summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
@@ -43,7 +43,7 @@
# COMMAND ----------

summary = bkmModel.summary
-print summary.clusterSizes # number of points
+print(summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
@@ -55,14 +55,14 @@

from pyspark.ml.clustering import GaussianMixture
gmm = GaussianMixture().setK(5)
-print gmm.explainParams()
+print(gmm.explainParams())
model = gmm.fit(sales)


# COMMAND ----------

summary = model.summary
-print model.weights
+print(model.weights)
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
@@ -89,7 +89,7 @@

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
-print lda.explainParams()
+print(lda.explainParams())
model = lda.fit(prepped)


@@ -21,9 +21,9 @@

# COMMAND ----------

print "Total Number of Stations: " + str(stationGraph.vertices.count())
print "Total Number of Trips in Graph: " + str(stationGraph.edges.count())
print "Total Number of Trips in Original Data: " + str(tripData.count())
print("Total Number of Stations: " + str(stationGraph.vertices.count()))
print("Total Number of Trips in Graph: " + str(stationGraph.edges.count()))
print("Total Number of Trips in Original Data: " + str(tripData.count()))


# COMMAND ----------
@@ -1,4 +1,4 @@
from __future__ import print_function

if __name__ == '__main__':
from pyspark.sql import SparkSession
spark = SparkSession.builder \
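
For context on the __future__ line in the hunk above: from __future__ import print_function makes print a function under Python 2 as well, which is how a single file can run on either interpreter; under Python 3 the import is a harmless no-op. A small illustrative sketch, not taken from the repo:

from __future__ import print_function  # must precede other statements; no-op on Python 3

import sys
print("running on Python", sys.version_info.major)  # same behaviour on 2.7 and 3.x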
code/Structured_APIs-Chapter_5_Basic_Structured_Operations.py (4 changes: 2 additions & 2 deletions)
@@ -190,8 +190,8 @@
from pyspark.sql import Row
schema = df.schema
newRows = [
Row("New Country", "Other Country", 5L),
Row("New Country 2", "Other Country 3", 1L)
Row("New Country", "Other Country", 5),
Row("New Country 2", "Other Country 3", 1)
]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)
code/Structured_APIs-Chapter_9_Data_Sources.py (4 changes: 2 additions & 2 deletions)
@@ -112,8 +112,8 @@
# COMMAND ----------

colName = "count"
-lowerBound = 0L
-upperBound = 348113L # this is the max count in our database
+lowerBound = 0
+upperBound = 348113 # this is the max count in our database
numPartitions = 10
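
These values feed the partitioned JDBC read that follows in the chapter; a sketch of how they are typically passed to DataFrameReader.jdbc, assuming a SparkSession named spark and with the URL, table name, and driver properties as placeholders rather than values copied from the book's code:

props = {"driver": "org.sqlite.JDBC"}        # placeholder connection properties
partitionedDF = spark.read.jdbc(
    url="jdbc:sqlite:/tmp/my-sqlite.db",     # placeholder URL
    table="flight_info",                     # placeholder table name
    column=colName,
    lowerBound=lowerBound,                   # plain Python 3 int; the old 0L long literal is unnecessary
    upperBound=upperBound,
    numPartitions=numPartitions,
    properties=props)
print(partitionedDF.rdd.getNumPartitions())  # expect numPartitions (10)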