Portfolio post: exploring Spark DataFrames with filament-lifetime data

In [4]:
# Raw measurements: [filament type, bulb power, observed life in hours].
filamentData = [
    ['filamentA', '100W', 605],
    ['filamentB', '100W', 683],
    ['filamentB', '100W', 691],
    ['filamentB', '200W', 561],
    ['filamentA', '200W', 530],
    ['filamentA', '100W', 619],
    ['filamentB', '100W', 686],
    ['filamentB', '200W', 600],
    ['filamentB', '100W', 696],
    ['filamentA', '200W', 579],
    ['filamentA', '200W', 520],
    ['filamentA', '100W', 622],
    ['filamentA', '100W', 668],
    ['filamentB', '200W', 569],
    ['filamentB', '200W', 555],
    ['filamentA', '200W', 541],
]
In [5]:
# Distribute the raw list across 4 partitions and peek at the first rows.
filamentDataRDD = sc.parallelize(filamentData, numSlices=4)
filamentDataRDD.take(4)
Out[5]:
[['filamentA', '100W', 605],
 ['filamentB', '100W', 683],
 ['filamentB', '100W', 691],
 ['filamentB', '200W', 561]]
In [6]:
# Creating a schema for the DataFrame.
# Explicit imports instead of `from pyspark.sql.types import *` so readers
# can see exactly which names this notebook relies on. FloatType is included
# here because a later cell casts LifeInHours to float.
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# NOTE(review): LifeInHours is declared as a string even though it holds
# numbers; this is why DataFrame.describe() later in the notebook reports
# no statistics until the column is cast to FloatType.
FilamentTypeColumn = StructField("FilamentType", StringType(), True)
BulbPowerColumn = StructField("BulbPower", StringType(), True)
LifeInHoursColumn = StructField("LifeInHours", StringType(), True)
FilamentDataFrameSchema = StructType([FilamentTypeColumn, BulbPowerColumn, LifeInHoursColumn])
FilamentDataFrameSchema
Out[6]:
StructType(List(StructField(FilamentType,StringType,true),StructField(BulbPower,StringType,true),StructField(LifeInHours,StringType,true)))
In [7]:
# Creating an RDD of Row objects: coerce each field to str so the rows
# match the all-string schema defined above.
from pyspark.sql import Row

filamentRDDofRows = filamentDataRDD.map(
    lambda rec: Row(str(rec[0]), str(rec[1]), str(rec[2]))
)
filamentRDDofRows.take(4)
Out[7]:
[<Row(filamentA, 100W, 605)>,
 <Row(filamentB, 100W, 683)>,
 <Row(filamentB, 100W, 691)>,
 <Row(filamentB, 200W, 561)>]
In [8]:
# Creating a DataFrame.
# NOTE(review): SQLContext is the legacy (Spark 1.x) entry point; modern
# code would use SparkSession.builder. Kept as-is because several later
# cells reference `sqlContext`.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
# Pair the RDD of Rows with the explicit all-string schema defined above.
filamentDataFrameRaw = sqlContext.createDataFrame(filamentRDDofRows, FilamentDataFrameSchema)
filamentDataFrameRaw.take(4)
Out[8]:
[Row(FilamentType=u'filamentA', BulbPower=u'100W', LifeInHours=u'605'),
 Row(FilamentType=u'filamentB', BulbPower=u'100W', LifeInHours=u'683'),
 Row(FilamentType=u'filamentB', BulbPower=u'100W', LifeInHours=u'691'),
 Row(FilamentType=u'filamentB', BulbPower=u'200W', LifeInHours=u'561')]
In [9]:
# Printing the schema of the DataFrame: all three columns are strings here,
# including LifeInHours (cast to float in the next cell).
filamentDataFrameRaw.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [10]:
# Changing the data type of a column: cast LifeInHours from string to float
# so numeric operations and summary statistics work on it.
filamentDataFrame = filamentDataFrameRaw.withColumn(
    'LifeInHours',
    filamentDataFrameRaw['LifeInHours'].cast(FloatType()),
)
filamentDataFrame.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: float (nullable = true)

In [11]:
# Preview the first 5 rows of the typed DataFrame.
filamentDataFrame.show(5)
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     200W|      561.0|
|   filamentA|     200W|      530.0|
+------------+---------+-----------+
only showing top 5 rows

In [154]:
# NOTE(review): execution count In[154] is far out of sequence — this cell
# appears to have been run after `filamentDataFrame` was rebound by a later
# cell whose schema keeps every column as a string, which would explain why
# describe() below shows only the `summary` column and no statistics.
# `dfdf` is also an uninformative name; prefer something like `dataSummary`.
dfdf = filamentDataFrame.describe()
dfdf.show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [12]:
# List the DataFrame's column names.
filamentDataFrame.columns
Out[12]:
['FilamentType', 'BulbPower', 'LifeInHours']
In [13]:
# Keep only the rows measured at 100W bulb power.
is100Watt = filamentDataFrame['BulbPower'] == '100W'
filamentDataFrame100Watt = filamentDataFrame.filter(is100Watt)
filamentDataFrame100Watt.show()
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentA|     100W|      605.0|
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentA|     100W|      619.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      622.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+

In [14]:
# Select 100W bulbs whose observed life exceeds 650 hours.
power100W = filamentDataFrame['BulbPower'] == '100W'
longLife = filamentDataFrame['LifeInHours'] > 650.0
filamentData100WGreater650 = filamentDataFrame.filter(power100W & longLife)
filamentData100WGreater650.show()
+------------+---------+-----------+
|FilamentType|BulbPower|LifeInHours|
+------------+---------+-----------+
|   filamentB|     100W|      683.0|
|   filamentB|     100W|      691.0|
|   filamentB|     100W|      686.0|
|   filamentB|     100W|      696.0|
|   filamentA|     100W|      668.0|
+------------+---------+-----------+

In [142]:
# Perform exploratory data analysis on a DataFrame.

# Defining the DataFrame schema.
# Explicit imports instead of a wildcard `import *`; this keeps the cell
# self-contained. NOTE(review): this duplicates the schema defined earlier
# in the notebook — a single definition reused by both loads would be safer.
from pyspark.sql.types import StructType, StructField, StringType

# NOTE(review): LifeInHours stays a string here, so describe() on frames
# loaded with this schema will not produce mean/stddev for it.
FilamentTypeColumn = StructField("FilamentType", StringType(), True)
BulbPowerColumn = StructField("BulbPower", StringType(), True)
LifeInHoursColumn = StructField("LifeInHours", StringType(), True)
FilamentDataFrameSchema = StructType([FilamentTypeColumn, BulbPowerColumn, LifeInHoursColumn])
In [144]:
# Load the CSV through the spark-csv data source with the explicit schema.
# NOTE(review): the rendered rows below still contain brackets and quotes
# (e.g. ['filamentB' | '100W' | 683]) — the file appears to contain Python
# list reprs rather than clean comma-separated values, and with
# header='true' the first data line is consumed as a header. The data file
# itself needs fixing. Hardcoding an absolute local path is also fragile —
# prefer a configurable data directory.
df = sqlContext.read.load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv', 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          schema = FilamentDataFrameSchema)
df.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [151]:
# Read the CSV with the built-in csv data source. Use the public short name
# "csv" instead of the internal class path
# "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat": the
# internal name is not a public API and can break between Spark versions.
# NOTE(review): this rebinds `filamentDataFrame`, clobbering the earlier
# frame whose LifeInHours column had been cast to float.
filamentDataFrame = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(FilamentDataFrameSchema)
    .load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv')
)
filamentDataFrame.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [136]:
# Under this load schema every column is a string (see the output below).
filamentDataFrame.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [137]:
# Calculating summary statistics.
# NOTE(review): the output below lists only the `summary` column because
# every column in this frame is a string; this Spark version's describe()
# computes statistics only for numeric columns. Cast LifeInHours to a
# numeric type first to get count/mean/stddev/min/max.
dataSummary = filamentDataFrame.describe()
dataSummary.show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [138]:
# Re-load the same file, this time passing header/inferSchema as booleans.
# NOTE(review): this duplicates the load above; the external
# 'com.databricks.spark.csv' format name could be replaced by the built-in
# 'csv' on Spark 2+. inferSchema=False is redundant when an explicit
# schema is supplied. The garbled bracket/quote artifacts in the output
# confirm the underlying CSV file needs cleaning.
df = sqlContext.read.load('file:///Users//KEVIN//Downloads//WorkArea//Python//pyspark//datafiles//filamentDataList.csv', 
                          format='com.databricks.spark.csv', 
                          header=True, 
                          inferSchema=False,
                         schema = FilamentDataFrameSchema)
df.show(5)
+-------------+---------+-----------+
| FilamentType|BulbPower|LifeInHours|
+-------------+---------+-----------+
| ['filamentB'|   '100W'|       683]|
| ['filamentB'|   '100W'|       691]|
| ['filamentB'|   '200W'|       561]|
| ['filamentA'|   '200W'|       530]|
| ['filamentA'|   '100W'|       619]|
+-------------+---------+-----------+
only showing top 5 rows

In [139]:
# NOTE(review): 15 rows rather than the 16 raw records — likely the first
# data line was consumed as the header; verify the CSV file's contents.
df.count()
Out[139]:
15
In [140]:
# Confirm the loaded frame's schema (all string columns).
df.printSchema()
root
 |-- FilamentType: string (nullable = true)
 |-- BulbPower: string (nullable = true)
 |-- LifeInHours: string (nullable = true)

In [141]:
# Statistics are empty again because every column is a string (see above).
df.describe().show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+

In [124]:
# Column name/type pairs — all 'string' under the explicit schema.
df.dtypes
Out[124]:
[('FilamentType', 'string'),
 ('BulbPower', 'string'),
 ('LifeInHours', 'string')]
In [125]:
# NOTE(review): duplicate of the describe() cell above with the same empty
# result; one of these cells should be removed.
df.describe().show()
+-------+
|summary|
+-------+
|  count|
|   mean|
| stddev|
|    min|
|    max|
+-------+