Portfolio post: exploring Spark DataFrames with filament-lifetime data

In [4]:
# Raw measurements: [filament type, bulb power, observed life in hours].
filamentData = [
    ['filamentA', '100W', 605],
    ['filamentB', '100W', 683],
    ['filamentB', '100W', 691],
    ['filamentB', '200W', 561],
    ['filamentA', '200W', 530],
    ['filamentA', '100W', 619],
    ['filamentB', '100W', 686],
    ['filamentB', '200W', 600],
    ['filamentB', '100W', 696],
    ['filamentA', '200W', 579],
    ['filamentA', '200W', 520],
    ['filamentA', '100W', 622],
    ['filamentA', '100W', 668],
    ['filamentB', '200W', 569],
    ['filamentB', '200W', 555],
    ['filamentA', '200W', 541],
]
In [5]:
# Distribute the raw list across 4 partitions and peek at the first rows.
filamentDataRDD = sc.parallelize(filamentData, numSlices=4)
filamentDataRDD.take(4)
Out[5]:
[['filamentA', '100W', 605],
 ['filamentB', '100W', 683],
 ['filamentB', '100W', 691],
 ['filamentB', '200W', 561]]
In [6]:
# Creating a schema for the DataFrame.
# Explicit imports instead of `from pyspark.sql.types import *` so readers
# can see exactly which names this notebook relies on. FloatType is included
# here because a later cell casts LifeInHours to float.
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# NOTE(review): LifeInHours is declared as a string even though it holds
# numbers; this is why DataFrame.describe() later in the notebook reports
# no statistics until the column is cast to FloatType.
FilamentTypeColumn = StructField("FilamentType", StringType(), True)
BulbPowerColumn = StructField("BulbPower", StringType(), True)
LifeInHoursColumn = StructField("LifeInHours", StringType(), True)
FilamentDataFrameSchema = StructType([FilamentTypeColumn, BulbPowerColumn, LifeInHoursColumn])
FilamentDataFrameSchema
Out[6]:
StructType(List(StructField(FilamentType,StringType,true),StructField(BulbPower,StringType,true),StructField(LifeInHours,StringType,true)))
In [7]:
# Creating an RDD of Row objects: coerce each field to str so the rows
# match the all-string schema defined above.
from pyspark.sql import Row

filamentRDDofRows = filamentDataRDD.map(
    lambda rec: Row(str(rec[0]), str(rec[1]), str(rec[2]))
)
filamentRDDofRows.take(4)
Out[7]:
[<Row(filamentA, 100W, 605)>,
 <Row(filamentB, 100W, 683)>,
 <Row(filamentB, 100W, 691)>,
 <Row(filamentB, 200W, 561)>]
In [8]:
# Creating a DataFrame.
# NOTE(review): SQLContext is the legacy (Spark 1.x) entry point; modern
# code would use SparkSession.builder. Kept as-is because several later
# cells reference `sqlContext`.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
# Pair the RDD of Rows with the explicit all-string schema defined above.
filamentDataFrameRaw = sqlContext.createDataFrame(filamentRDDofRows, FilamentDataFrameSchema)
filamentDataFrameRaw.take(4)
Out[8]:
[Row(FilamentType=u'filamentA', BulbPower=u'100W', LifeInHours=u'605'),
 Row(FilamentType=u'filamentB', BulbPower=u'100W', LifeInHours=u'683'),
 Row(FilamentType=u'filamentB', BulbPower=u'100W', LifeInHours=u'691'),
 Row(FilamentType=u'filamentB', BulbPower=u'200W', LifeInHours=u'561')]
In [9]:
# Printing the schema of the DataFrame: all three columns are strings here,
# including LifeInHours (cast to float in the next cell).
filamentDataFrameRaw.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [10]:
# Changing the data type of a column: cast LifeInHours from string to float
# so numeric operations and summary statistics work on it.
filamentDataFrame = filamentDataFrameRaw.withColumn(
    'LifeInHours',
    filamentDataFrameRaw['LifeInHours'].cast(FloatType()),
)
filamentDataFrame.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: float (nullable = true)

In [11]:
# Preview the first 5 rows of the typed DataFrame.
filamentDataFrame.show(5)
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     200W|      561.0|
|   filamentA|     200W|      530.0|
+------------+---------+-----------+
only showing top 5 rows

In [154]:
# NOTE(review): execution count In[154] is far out of sequence — this cell
# appears to have been run after `filamentDataFrame` was rebound by a later
# cell whose schema keeps every column as a string, which would explain why
# describe() below shows only the `summary` column and no statistics.
# `dfdf` is also an uninformative name; prefer something like `dataSummary`.
dfdf = filamentDataFrame.describe()
dfdf.show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [12]:
# List the DataFrame's column names.
filamentDataFrame.columns
Out[12]:
['FilamentType', 'BulbPower', 'LifeInHours']
In [13]:
# Keep only the rows measured at 100W bulb power.
is100Watt = filamentDataFrame['BulbPower'] == '100W'
filamentDataFrame100Watt = filamentDataFrame.filter(is100Watt)
filamentDataFrame100Watt.show()
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentA|     100W|      619.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      622.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+

In [14]:
# Select 100W bulbs whose observed life exceeds 650 hours.
power100W = filamentDataFrame['BulbPower'] == '100W'
longLife = filamentDataFrame['LifeInHours'] > 650.0
filamentData100WGreater650 = filamentDataFrame.filter(power100W & longLife)
filamentData100WGreater650.show()
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+

In [142]:
# Perform exploratory data analysis on a DataFrame.

# Defining the DataFrame schema.
# Explicit imports instead of a wildcard `import *`; this keeps the cell
# self-contained. NOTE(review): this duplicates the schema defined earlier
# in the notebook — a single definition reused by both loads would be safer.
from pyspark.sql.types import StructType, StructField, StringType

# NOTE(review): LifeInHours stays a string here, so describe() on frames
# loaded with this schema will not produce mean/stddev for it.
FilamentTypeColumn = StructField("FilamentType", StringType(), True)
BulbPowerColumn = StructField("BulbPower", StringType(), True)
LifeInHoursColumn = StructField("LifeInHours", StringType(), True)
FilamentDataFrameSchema = StructType([FilamentTypeColumn, BulbPowerColumn, LifeInHoursColumn])
In [144]:
# Load the CSV through the spark-csv data source with the explicit schema.
# NOTE(review): the rendered rows below still contain brackets and quotes
# (e.g. ['filamentB' | '100W' | 683]) — the file appears to contain Python
# list reprs rather than clean comma-separated values, and with
# header='true' the first data line is consumed as a header. The data file
# itself needs fixing. Hardcoding an absolute local path is also fragile —
# prefer a configurable data directory.
df = sqlContext.read.load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          schema = FilamentDataFrameSchema)
df.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [151]:
# Read the CSV with the built-in csv data source. Use the public short name
# "csv" instead of the internal class path
# "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat": the
# internal name is not a public API and can break between Spark versions.
# NOTE(review): this rebinds `filamentDataFrame`, clobbering the earlier
# frame whose LifeInHours column had been cast to float.
filamentDataFrame = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(FilamentDataFrameSchema)
    .load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv')
)
filamentDataFrame.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [136]:
# Under this load schema every column is a string (see the output below).
filamentDataFrame.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [137]:
# Calculating summary statistics.
# NOTE(review): the output below lists only the `summary` column because
# every column in this frame is a string; this Spark version's describe()
# computes statistics only for numeric columns. Cast LifeInHours to a
# numeric type first to get count/mean/stddev/min/max.
dataSummary = filamentDataFrame.describe()
dataSummary.show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [138]:
# Re-load the same file, this time passing header/inferSchema as booleans.
# NOTE(review): this duplicates the load above; the external
# 'com.databricks.spark.csv' format name could be replaced by the built-in
# 'csv' on Spark 2+. inferSchema=False is redundant when an explicit
# schema is supplied. The garbled bracket/quote artifacts in the output
# confirm the underlying CSV file needs cleaning.
df = sqlContext.read.load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv', 
                          format='com.databricks.spark.csv', 
                          header=True, 
                          inferSchema=False,
                         schema = FilamentDataFrameSchema)
df.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [139]:
# NOTE(review): 15 rows rather than the 16 raw records — likely the first
# data line was consumed as the header; verify the CSV file's contents.
df.count()
Out[139]:
15
In [140]:
# Confirm the loaded frame's schema (all string columns).
df.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [141]:
# Statistics are empty again because every column is a string (see above).
df.describe().show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [124]:
# Column name/type pairs — all 'string' under the explicit schema.
df.dtypes
Out[124]:
[('FilamentType', 'string'),
 ('BulbPower', 'string'),
 ('LifeInHours', 'string')]
In [125]:
# NOTE(review): duplicate of the describe() cell above with the same empty
# result; one of these cells should be removed.
df.describe().show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+