portfolio-post

In [2]:
import sqlalchemy as db
import pandas as pd
import numpy as np
import pymysql

engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
In [6]:
actor = db.Table('actor', metadata, autoload=True, autoload_with=engine)
In [7]:
# Print the column names
print(actor.columns.keys())
['actor_id', 'first_name', 'last_name', 'last_update']
In [8]:
# Print full table metadata
print(repr(metadata.tables['actor']))
Table('actor', MetaData(bind=None), Column('actor_id', SMALLINT(display_width=5, unsigned=True), table=<actor>, primary_key=True, nullable=False), Column('first_name', VARCHAR(length=45), table=<actor>, nullable=False), Column('last_name', VARCHAR(length=45), table=<actor>, nullable=False), Column('last_update', TIMESTAMP(), table=<actor>, nullable=False, server_default=DefaultClause(<sqlalchemy.sql.elements.TextClause object at 0x000000000F100550>, for_update=False)), schema=None)

Querying

Table and MetaData have already been imported. The metadata is available as metadata.

In [10]:
#Equivalent to 'SELECT * FROM census'
query = db.select([actor])
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()

# Limit 3
ResultSet[:3]
Out[10]:
[(1, 'PENELOPE', 'GUINESS', datetime.datetime(2006, 2, 15, 4, 34, 33)),
 (2, 'NICK', 'WAHLBERG', datetime.datetime(2006, 2, 15, 4, 34, 33)),
 (3, 'ED', 'CHASE', datetime.datetime(2006, 2, 15, 4, 34, 33))]

ResultProxy: The object returned by the .execute() method. It can be used in a variety of ways to get the data returned by the query.

ResultSet: The actual data asked for in the query when using a fetch method such as .fetchall() on a ResultProxy.

Dealing with Large ResultSet

We use .fetchmany() to load optimal no of rows and overcome memory issues in case of large datasets

In [ ]:
while flag:
    partial_results = ResultProxy.fetchmany(50)
    if(partial_results == []): 
	flag = False
    //
	code
   //
ResultProxy.close()
In [11]:
#Convert to dataframe
df = pd.DataFrame(ResultSet)
df.columns = ResultSet[0].keys()

Filtering data

Lets see some examples of raw SQLite Queries and queries using SQLAlchemy.

where

SQL :

SELECT * FROM city WHERE country_id = 6

In [17]:
#SQLAlchemy :

engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
city = db.Table('city', metadata, autoload=True, autoload_with=engine)
query = db.select([city]).where(city.columns.country_id == 6)
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet
Out[17]:
[(20, 'Almirante Brown', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (43, 'Avellaneda', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (45, 'Baha Blanca', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (128, 'Crdoba', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (161, 'Escobar', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (165, 'Ezeiza', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (289, 'La Plata', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (334, 'Merlo', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (424, 'Quilmes', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (454, 'San Miguel de Tucumn', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (457, 'Santa F', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (524, 'Tandil', 6, datetime.datetime(2006, 2, 15, 4, 45, 25)),
 (567, 'Vicente Lpez', 6, datetime.datetime(2006, 2, 15, 4, 45, 25))]

in

SQL :

SELECT name, city FROM customer_list WHERE country IN (Argentina, Austria)

In [18]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
customer_list = db.Table('customer_list', metadata, autoload=True, autoload_with=engine)

query = db.select([customer_list.columns.name, customer_list.columns.city]).where(customer_list.columns.country.in_(['Argentina', 'Austria']))
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet
Out[18]:
[('WILLIE MARKHAM', 'Almirante Brown'),
 ('JORDAN ARCHULETA', 'Avellaneda'),
 ('JASON MORRISSEY', 'Baha Blanca'),
 ('KIMBERLY LEE', 'Crdoba'),
 ('MICHEAL FORMAN', 'Escobar'),
 ('DARRYL ASHCRAFT', 'Ezeiza'),
 ('JULIA FLORES', 'La Plata'),
 ('FLORENCE WOODS', 'Merlo'),
 ('PERRY SWAFFORD', 'Quilmes'),
 ('LYDIA BURKE', 'San Miguel de Tucumn'),
 ('ERIC ROBERT', 'Santa F'),
 ('LEONARD SCHOFIELD', 'Tandil'),
 ('WILLIE HOWELL', 'Vicente Lpez'),
 ('AUDREY RAY', 'Graz'),
 ('JILL HAWKINS', 'Linz'),
 ('NORA HERRERA', 'Salzburg')]

and, or, not

SQL : SELECT * FROM customer_list WHERE country = 'Argentina' AND NOT country = 'Austria'

In [21]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
customer_list = db.Table('customer_list', metadata, autoload=True, autoload_with=engine)

query = db.select([customer_list]).where(db.and_(customer_list.columns.country == 'Argentina', customer_list.columns.country != 'Austria'))
ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet
Out[21]:
[(359, 'WILLIE MARKHAM', '1623 Kingstown Drive', '91299', '296394569728', 'Almirante Brown', 'Argentina', 'active', 2),
 (560, 'JORDAN ARCHULETA', '1229 Varanasi (Benares) Manor', '40195', '817740355461', 'Avellaneda', 'Argentina', 'active', 1),
 (322, 'JASON MORRISSEY', '1427 A Corua (La Corua) Place', '85799', '972574862516', 'Baha Blanca', 'Argentina', 'active', 1),
 (24, 'KIMBERLY LEE', '96 Tafuna Way', '99865', '934730187245', 'Crdoba', 'Argentina', 'active', 2),
 (445, 'MICHEAL FORMAN', '203 Tambaram Street', '73942', '411549550611', 'Escobar', 'Argentina', 'active', 1),
 (530, 'DARRYL ASHCRAFT', '166 Jinchang Street', '86760', '717566026669', 'Ezeiza', 'Argentina', 'active', 2),
 (89, 'JULIA FLORES', '1926 El Alto Avenue', '75543', '846225459260', 'La Plata', 'Argentina', 'active', 1),
 (107, 'FLORENCE WOODS', '1532 Dzerzinsk Way', '9599', '330838016880', 'Merlo', 'Argentina', 'active', 1),
 (585, 'PERRY SWAFFORD', '773 Dallas Manor', '12664', '914466027044', 'Quilmes', 'Argentina', 'active', 1),
 (243, 'LYDIA BURKE', '1483 Pathankot Street', '37288', '686015532180', 'San Miguel de Tucumn', 'Argentina', 'active', 1),
 (331, 'ERIC ROBERT', '430 Kumbakonam Drive', '28814', '105470691550', 'Santa F', 'Argentina', 'active', 1),
 (405, 'LEONARD SCHOFIELD', '88 Nagaon Manor', '86868', '779461480495', 'Tandil', 'Argentina', 'active', 1),
 (219, 'WILLIE HOWELL', '1244 Allappuzha (Alleppey) Place', '20657', '991802825778', 'Vicente Lpez', 'Argentina', 'active', 2)]

order by

SQL : SELECT * FROM payment ORDER BY amount DESC, amount LIMIT 3

In [27]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
payment = db.Table('payment', metadata, autoload=True, autoload_with=engine)

query = db.select([payment]).order_by(db.desc(payment.columns.amount), payment.columns.amount)

ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet[:3]
Out[27]:
[(8272, 305, 1, 2166, Decimal('11.99'), datetime.datetime(2005, 6, 17, 23, 51, 21), datetime.datetime(2006, 2, 15, 22, 15, 47)),
 (9803, 362, 1, 14759, Decimal('11.99'), datetime.datetime(2005, 8, 21, 23, 28, 58), datetime.datetime(2006, 2, 15, 22, 16, 57)),
 (15821, 591, 2, 4383, Decimal('11.99'), datetime.datetime(2005, 7, 7, 20, 45, 51), datetime.datetime(2006, 2, 15, 22, 23, 18))]

functions

SQL : SELECT SUM(amount) FROM payment

In [29]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
payment = db.Table('payment', metadata, autoload=True, autoload_with=engine)

query = db.select([db.func.sum(payment.columns.amount)])

ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet
Out[29]:
[(Decimal('67416.51'),)]

other functions include avg, count, min, max…

group by

SQL :

SELECT SUM(pop2008) as pop2008, sex FROM census

In [30]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
payment = db.Table('payment', metadata, autoload=True, autoload_with=engine)

query = db.select([db.func.sum(payment.columns.amount).label('Total Amount'), payment.columns.amount]).group_by(payment.columns.amount)

ResultProxy = connection.execute(query)
ResultSet = ResultProxy.fetchall()
ResultSet
Out[30]:
[(Decimal('0.00'), Decimal('0.00')),
 (Decimal('2949.21'), Decimal('0.99')),
 (Decimal('1.98'), Decimal('1.98')),
 (Decimal('1273.60'), Decimal('1.99')),
 (Decimal('10590.58'), Decimal('2.99')),
 (Decimal('31.84'), Decimal('3.98')),
 (Decimal('4424.91'), Decimal('3.99')),
 (Decimal('18907.11'), Decimal('4.99')),
 (Decimal('41.86'), Decimal('5.98')),
 (Decimal('7781.01'), Decimal('5.99')),
 (Decimal('7821.81'), Decimal('6.99')),
 (Decimal('39.90'), Decimal('7.98')),
 (Decimal('5353.30'), Decimal('7.99')),
 (Decimal('8.97'), Decimal('8.97')),
 (Decimal('4360.15'), Decimal('8.99')),
 (Decimal('9.98'), Decimal('9.98')),
 (Decimal('2557.44'), Decimal('9.99')),
 (Decimal('1142.96'), Decimal('10.99')),
 (Decimal('119.90'), Decimal('11.99'))]

SQLAlchemy :

db.select([db.func.sum(census.columns.pop2008).label('pop2008'), census.columns.sex]).group_by(census.columns.sex) distinct

SQL : SELECT DISTINCT state FROM census

SQLAlchemy :

db.select([census.columns.state.distinct()]) case & cast

The case() expression accepts a list of conditions to match and the column to return if the condition matches, followed by an else_ if none of the conditions match.

cast() function to convert an expression to a particular type

example

femalepop = db.func.sum(db.case([(census.columns.sex == 'F', census.columns.pop2000)],else=0)) total_pop = db.cast(db.func.sum(census.columns.pop2000), db.Float) query = db.select([female_pop/total_pop * 100]) result = connection.execute(query).scalar() print(result)

joins

If you have two tables that already have an established relationship, you can automatically use that relationship by just adding the columns we want from each table to the select statement.

select([census.columns.pop2008, state_fact.columns.abbreviation])

example

census = db.Table('census', metadata, autoload=True, autoload_with=engine) state_fact = db.Table('state_fact', metadata, autoload=True, autoload_with=engine)

Automatic Join

query = db.select([census.columns.pop2008, state_fact.columns.abbreviation]) result = connection.execute(query).fetchall() df = pd.DataFrame(results) df.columns = results[0].keys() df.head(5)

Manual Join query = db.select([census, state_fact]) query = query.select_from(census.join(state_fact, census.columns.state == state_fact.columns.name)) results = connection.execute(query).fetchall() df = pd.DataFrame(results) df.columns = results[0].keys() df.head(5)

Creating and Inserting Data into Tables By passing the database which is not present, to the engine then sqlalchemy automatically creates a new database.

The case() expression accepts a list of conditions to match and the column to return if the condition matches, followed by an else_ if none of the conditions match.

cast() function to convert an expression to a particular type

example

Creating Database and Table

In [4]:
#SQLAlchemy :
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()

emp = db.Table('emp', metadata,
              db.Column('Id', db.Integer()),
              db.Column('name', db.String(255), nullable=False),
              db.Column('salary', db.Float(), default=100.0),
              db.Column('active', db.Boolean(), default=True)
              )

metadata.create_all(engine) #Creates the table

Inserting Data

In [32]:
#Inserting record one by one
query = db.insert(emp).values(Id=1, name='naveen', salary=60000.00, active=True) 
ResultProxy = connection.execute(query)
In [33]:
#Inserting many records at ones
query = db.insert(emp) 
values_list = [{'Id':'2', 'name':'ram', 'salary':80000, 'active':False},
               {'Id':'3', 'name':'ramesh', 'salary':70000, 'active':True}]
ResultProxy = connection.execute(query,values_list)
In [34]:
results = connection.execute(db.select([emp])).fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
df.head(4)
Out[34]:
Id name salary active
0 1 naveen 60000.0 True
1 2 ram 80000.0 False
2 3 ramesh 70000.0 True

Updating data in Databases

In [35]:
# Build a statement to update the salary to 100000
query = db.update(emp).values(salary = 100000)
query = query.where(emp.columns.Id == 1)
results = connection.execute(query)
In [36]:
results = connection.execute(db.select([emp])).fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
df.head(4)
Out[36]:
Id name salary active
0 1 naveen 100000.0 True
1 2 ram 80000.0 False
2 3 ramesh 70000.0 True

Delete Records

In [37]:
# Build a statement to delete where salary < 100000
query = db.delete(emp)
query = query.where(emp.columns.salary < 100000)
results = connection.execute(query)
In [38]:
results = connection.execute(db.select([emp])).fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
df.head(4)
Out[38]:
Id name salary active
0 1 naveen 100000.0 True

Dropping a Table

In [6]:
engine = db.create_engine("mysql+pymysql://root:password@localhost/sakila?host=localhost?port=3306")
connection = engine.connect()
metadata = db.MetaData()
emp = db.Table('emp', metadata, autoload=True, autoload_with=engine)

print(emp.columns.keys())

#emp.drop(engine) #drops a single table
['Id', 'name', 'salary', 'active']
In [7]:
emp.drop(engine) #drops a single table
In [ ]:
 
In [ ]:
table_name.drop(engine) #drops a single table
metadata.drop_all(engine) #drops all the tables in the database