#
import sqlalchemy as sqla
import pymysql
import pandas as pd
import os
= "" # get this from the instructor
username = "" # get this from the instructor
password = "" # get this from the instructor
awsresource = 3306 port
Databases and SQL - Selection - Python
Fundamentals of Data Science
Connecting to the database using sqlalchemy
Our first task is to make a connection to our database using python and the sqlalchemy package.
= "sakila"
dbname = f"mysql+pymysql://{username}:{password}@{awsresource}:{port}/{dbname}"
dburi = sqla.create_engine(dburi) engine
The following functions make it more convenient to run SQL commands.
def run_sql(sql, engine):
"send sql to database given by engine and return the result"
with engine.connect() as conn:
= conn.execute(sqla.text(sql))
result return result.all()
def df_select(sql, engine):
"send sql to database given by engine and return the result as a pandas dataframe"
with engine.connect() as conn:
= pd.read_sql(sqla.text(sql), con=conn)
result print(f"Retrieved {result.shape[0]} records")
return result
To test the connection, let’s use the show
command to list the tables in the sakila
database.
= run_sql("show tables;", engine)
tables for x in tables:
print(x)
('actor',)
('actor_info',)
('address',)
('category',)
('city',)
('country',)
('customer',)
('customer_list',)
('film',)
('film_actor',)
('film_category',)
('film_list',)
('film_text',)
('inventory',)
('language',)
('nicer_but_slower_film_list',)
('payment',)
('rental',)
('sales_by_film_category',)
('sales_by_store',)
('staff',)
('staff_list',)
('store',)
Now let’s take a closer look at one of these tables using the describe
command. The most important info we learn here are the names of the fields in the actor
table, and their datatypes.
= run_sql("describe actor;", engine)
actor_info for x in actor_info[:10]:
print(x)
('actor_id', 'smallint unsigned', 'NO', 'PRI', None, 'auto_increment')
('first_name', 'varchar(45)', 'NO', '', None, '')
('last_name', 'varchar(45)', 'NO', 'MUL', None, '')
('last_update', 'timestamp', 'NO', '', 'CURRENT_TIMESTAMP', 'DEFAULT_GENERATED on update CURRENT_TIMESTAMP')
Select and select where
The fundamental SQL command is select
. The syntax for the select
command is
select field, field, field from table;
In our case, let’s select some information from the actor
table.
= run_sql("select actor_id, first_name, last_name from actor;", engine)
actor_data for x in actor_data[:10]:
print(x) # result is a list of tuples
(1, 'PENELOPE', 'GUINESS')
(2, 'NICK', 'WAHLBERG')
(3, 'ED', 'CHASE')
(4, 'JENNIFER', 'DAVIS')
(5, 'JOHNNY', 'LOLLOBRIGIDA')
(6, 'BETTE', 'NICHOLSON')
(7, 'GRACE', 'MOSTEL')
(8, 'MATTHEW', 'JOHANSSON')
(9, 'JOE', 'SWANK')
(10, 'CHRISTIAN', 'GABLE')
It’s a lot more convenient to get this in the form of a pandas dataframe. One could convert it “by hand”, but pandas actually gives us a direct method for this. It needs a connection as an argument.
The function sqla.text
turns an SQL string into the appropriate format for passing to the pandas function.
with engine.connect() as conn:
= pd.read_sql(
actor_df "SELECT actor_id, first_name, last_name FROM actor;"), conn
sqla.text(
) actor_df.head()
actor_id | first_name | last_name | |
---|---|---|---|
0 | 1 | PENELOPE | GUINESS |
1 | 2 | NICK | WAHLBERG |
2 | 3 | ED | CHASE |
3 | 4 | JENNIFER | DAVIS |
4 | 5 | JOHNNY | LOLLOBRIGIDA |
For another example, let’s look at the film
table.
= run_sql("describe film;", engine)
film_info for x in film_info:
print(x)
('film_id', 'smallint unsigned', 'NO', 'PRI', None, 'auto_increment')
('title', 'varchar(128)', 'NO', 'MUL', None, '')
('description', 'text', 'YES', '', None, '')
('release_year', 'year', 'YES', '', None, '')
('language_id', 'tinyint unsigned', 'NO', 'MUL', None, '')
('original_language_id', 'tinyint unsigned', 'YES', 'MUL', None, '')
('rental_duration', 'tinyint unsigned', 'NO', '', '3', '')
('rental_rate', 'decimal(4,2)', 'NO', '', '4.99', '')
('length', 'smallint unsigned', 'YES', '', None, '')
('replacement_cost', 'decimal(5,2)', 'NO', '', '19.99', '')
('rating', "enum('G','PG','PG-13','R','NC-17')", 'YES', '', 'G', '')
('special_features', "set('Trailers','Commentaries','Deleted Scenes','Behind the Scenes')", 'YES', '', None, '')
('last_update', 'timestamp', 'NO', '', 'CURRENT_TIMESTAMP', 'DEFAULT_GENERATED on update CURRENT_TIMESTAMP')
Using what we already know, the simplest way to work with these tables is to move them into a pandas dataframe locally and work there. For example, suppose we want to look at the film ratings.
with engine.connect() as conn:
= pd.read_sql(sqla.text("select film_id, title, rating from film;"), conn)
film_df "rating"].value_counts() film_df[
rating
PG-13 223
NC-17 210
R 195
PG 194
G 178
Name: count, dtype: int64
And, of course, if we want to pick out the ‘G’-rated films:
= film_df[film_df["rating"] == "G"]
GFilms GFilms.shape
(178, 3)
But suppose there are a huge number of films, and we don’t want to download all of them just to throw away the ones that aren’t G-rated. Then we can use a where
clause.
In this example we also use the film_id
as the table index.
with engine.connect() as conn:
= pd.read_sql(
g_film_df "select film_id, title, rating from film where rating='G'"),
sqla.text(
conn,="film_id",
index_col
)print(f"retrieved {g_film_df.shape[0]} records")
retrieved 178 records
What about films between 90 minutes and two hours? And what if we want the result sorted by length?
with engine.connect() as conn:
= pd.read_sql(
midlength_df
sqla.text("select film_id, title, length from film where length>=90 and length<=120 order by length;"
),
conn,="film_id",
index_col
)print(f"retrieved {midlength_df.shape[0]} records")
retrieved 223 records
For simplicity we can use the df_select function.
= df_select(
midlength_df "Select film_id, title, length from film where length>=90 and length<=120 order by length desc;",
engine, )
Retrieved 223 records
So to summarize, we have
select f1, f2, ... fn from table where condition order by field;
select f1, f2, ... fn from table where condition order by field desc;
Grouping and Summarizing
The grouping and summarizing operations in pandas are modeled on those from SQL. The group by
clause in SQL does the grouping, and then one can apply summarizing functions to fields. For example, suppose we want to count the number of movies with each rating.
We say:
select count(rating) from film group by rating;
"select rating, count(rating) from film group by rating;", engine) run_sql(
[('PG', 194), ('G', 178), ('NC-17', 210), ('PG-13', 223), ('R', 195)]
We have the usual summary functions (count
, avg
, min
, max
,…)
run_sql("select rating, min(length), avg(length), stddev(length), max(length) from film group by rating;",
engine, )
[('PG', 46, Decimal('112.0052'), 39.15571575801518, 185),
('G', 47, Decimal('111.0506'), 41.65431205970793, 185),
('NC-17', 46, Decimal('113.2286'), 40.717904601072505, 184),
('PG-13', 46, Decimal('120.4439'), 41.074090844322875, 185),
('R', 49, Decimal('118.6615'), 38.40817631559128, 185)]
You can of course put this in a dataframe.
df_select("select rating, min(length), avg(length), stddev(length), max(length) from film group by rating;",
engine, )
Retrieved 5 records
rating | min(length) | avg(length) | stddev(length) | max(length) | |
---|---|---|---|---|---|
0 | PG | 46 | 112.0052 | 39.155716 | 185 |
1 | G | 47 | 111.0506 | 41.654312 | 185 |
2 | NC-17 | 46 | 113.2286 | 40.717905 | 184 |
3 | PG-13 | 46 | 120.4439 | 41.074091 | 185 |
4 | R | 49 | 118.6615 | 38.408176 | 185 |
You can rename the columns inside the SQL query:
df_select("select rating, min(length) as minlen, avg(length) as avlen, stddev(length) as stdlen, max(length) as maxlen from film group by rating;",
engine, )
Retrieved 5 records
rating | minlen | avlen | stdlen | maxlen | |
---|---|---|---|---|---|
0 | PG | 46 | 112.0052 | 39.155716 | 185 |
1 | G | 47 | 111.0506 | 41.654312 | 185 |
2 | NC-17 | 46 | 113.2286 | 40.717905 | 184 |
3 | PG-13 | 46 | 120.4439 | 41.074091 | 185 |
4 | R | 49 | 118.6615 | 38.408176 | 185 |
And you can sort the result:
df_select("select rating, min(length) as minlen, avg(length) as avlen, stddev(length) as stdlen, max(length) as maxlen from film group by rating order by avlen;",
engine, )
Retrieved 5 records
rating | minlen | avlen | stdlen | maxlen | |
---|---|---|---|---|---|
0 | G | 47 | 111.0506 | 41.654312 | 185 |
1 | PG | 46 | 112.0052 | 39.155716 | 185 |
2 | NC-17 | 46 | 113.2286 | 40.717905 | 184 |
3 | R | 49 | 118.6615 | 38.408176 | 185 |
4 | PG-13 | 46 | 120.4439 | 41.074091 | 185 |
So to summarize we have:
select group_field, summary(f1) as name1, summary(f2) as name2,... from table
group by group_field order by name1
Two extras: distinct and limit
Finally, if you just want to see the possible values in a field, or you only want to consider distinct values in a sum, you can use the DISTINCT
keyword.
"Select distinct rating from film", engine) run_sql(
[('PG',), ('G',), ('NC-17',), ('PG-13',), ('R',)]
"select count(distinct rating) from film", engine) run_sql(
[(5,)]
If you only want to retrieve, say, 20 records, you can use the LIMIT
keyword.
"select title, length from film limit 20", engine) run_sql(
[('ACADEMY DINOSAUR', 86),
('ACE GOLDFINGER', 48),
('ADAPTATION HOLES', 50),
('AFFAIR PREJUDICE', 117),
('AFRICAN EGG', 130),
('AGENT TRUMAN', 169),
('AIRPLANE SIERRA', 62),
('AIRPORT POLLOCK', 54),
('ALABAMA DEVIL', 114),
('ALADDIN CALENDAR', 63),
('ALAMO VIDEOTAPE', 126),
('ALASKA PHANTOM', 136),
('ALI FOREVER', 150),
('ALICE FANTASIA', 94),
('ALIEN CENTER', 46),
('ALLEY EVOLUTION', 180),
('ALONE TRIP', 82),
('ALTER VICTORY', 57),
('AMADEUS HOLY', 113),
('AMELIE HELLFIGHTERS', 79)]