# Always remember to install important packages!
import pandas as pd
import numpy as np
# ------ DO NOT REMOVE ------ #
= pd.DataFrame({
grades_df "Name": ["John", "Bob", "Charlie", "Diana", "Edward", "Fiona", "George",
"Hannah", "Ian", "Ian", "Liam", "Mia", "Noah", "Olivia", "Sophia"],
"Homework": [95.0, 82.5, 75.3, 91.2, np.nan, 88.6, 92.3, 85.7, 70.1, 70.1,
78.9, 94.2, np.nan, 86.7, 90.8],
"Exam Score": [85.5, "Not Graded", 65.3, 88.9, 72.2, 92.1, 94.1, 83, 70, 70, 79.7, 91.4, 80.1, 87.4, "Not Graded"],
"Office Hours": ["Yes", "No", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
"Yes", "No", "Yes", "No", "Yes"],
"Final Grade": [90.5, 78.2, 65.8, 88.3, 72.4, 91.0, 94.3, 83.5, 68.6, 68.6,
79.7, 90.1, 80.5, 87.4, 82.3]
})
# ------ DO NOT REMOVE ------ #
Understanding the Data
# 1. Use the .info() function to identify data types and missing values
grades_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 15 non-null object
1 Homework 13 non-null float64
2 Exam Score 15 non-null object
3 Office Hours 15 non-null object
4 Final Grade 15 non-null float64
dtypes: float64(2), object(3)
memory usage: 732.0+ bytes
# 2. Return the dataset to observe its structure
grades_df
Name | Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|---|
0 | John | 95.0 | 85.5 | Yes | 90.5 |
1 | Bob | 82.5 | Not Graded | No | 78.2 |
2 | Charlie | 75.3 | 65.3 | Yes | 65.8 |
3 | Diana | 91.2 | 88.9 | Yes | 88.3 |
4 | Edward | NaN | 72.2 | No | 72.4 |
5 | Fiona | 88.6 | 92.1 | No | 91.0 |
6 | George | 92.3 | 94.1 | Yes | 94.3 |
7 | Hannah | 85.7 | 83 | Yes | 83.5 |
8 | Ian | 70.1 | 70 | No | 68.6 |
9 | Ian | 70.1 | 70 | No | 68.6 |
10 | Liam | 78.9 | 79.7 | Yes | 79.7 |
11 | Mia | 94.2 | 91.4 | No | 90.1 |
12 | Noah | NaN | 80.1 | Yes | 80.5 |
13 | Olivia | 86.7 | 87.4 | No | 87.4 |
14 | Sophia | 90.8 | Not Graded | Yes | 82.3 |
Our data is a bit messy! We can see missing values in the “Homework” and “Exam Score” volumns as well as the categorical data in the “Office Hours” column.
Duplicate Values
# 3. Check how many duplicate rows are in the dataset (if any)
sum() grades_df.duplicated().
1
# 4. Drop duplicate rows (if any)
=True)
grades_df.drop_duplicates(inplace grades_df
Name | Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|---|
0 | John | 95.0 | 85.5 | Yes | 90.5 |
1 | Bob | 82.5 | Not Graded | No | 78.2 |
2 | Charlie | 75.3 | 65.3 | Yes | 65.8 |
3 | Diana | 91.2 | 88.9 | Yes | 88.3 |
4 | Edward | NaN | 72.2 | No | 72.4 |
5 | Fiona | 88.6 | 92.1 | No | 91.0 |
6 | George | 92.3 | 94.1 | Yes | 94.3 |
7 | Hannah | 85.7 | 83 | Yes | 83.5 |
8 | Ian | 70.1 | 70 | No | 68.6 |
10 | Liam | 78.9 | 79.7 | Yes | 79.7 |
11 | Mia | 94.2 | 91.4 | No | 90.1 |
12 | Noah | NaN | 80.1 | Yes | 80.5 |
13 | Olivia | 86.7 | 87.4 | No | 87.4 |
14 | Sophia | 90.8 | Not Graded | Yes | 82.3 |
Missing Values
The missing values in the “Homework” column are represented by “NaN” which can be handled easier.
# 5. Replace missing values in the "Homework" column with the average
# homework score
"Homework"].fillna(grades_df["Homework"].mean().round(2),
grades_df[=True)
inplace grades_df
FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
grades_df["Homework"].fillna(grades_df["Homework"].mean().round(2),
Name | Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|---|
0 | John | 95.00 | 85.5 | Yes | 90.5 |
1 | Bob | 82.50 | Not Graded | No | 78.2 |
2 | Charlie | 75.30 | 65.3 | Yes | 65.8 |
3 | Diana | 91.20 | 88.9 | Yes | 88.3 |
4 | Edward | 85.94 | 72.2 | No | 72.4 |
5 | Fiona | 88.60 | 92.1 | No | 91.0 |
6 | George | 92.30 | 94.1 | Yes | 94.3 |
7 | Hannah | 85.70 | 83 | Yes | 83.5 |
8 | Ian | 70.10 | 70 | No | 68.6 |
10 | Liam | 78.90 | 79.7 | Yes | 79.7 |
11 | Mia | 94.20 | 91.4 | No | 90.1 |
12 | Noah | 85.94 | 80.1 | Yes | 80.5 |
13 | Olivia | 86.70 | 87.4 | No | 87.4 |
14 | Sophia | 90.80 | Not Graded | Yes | 82.3 |
Next, we must also handle the missing values in the “Exam Score” column. Since it is not in the form of “None” or “NaN”, we will need to handle it in a different way.
# 6. Replace missing values in the "Exam Score" column with the average
# exam score
"Exam Score"] = grades_df["Exam Score"].replace("Not Graded", np.nan)
grades_df[
"Exam Score"].mean(), inplace=True)
grades_df.fillna(grades_df[
grades_df
Name | Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|---|
0 | John | 95.00 | 85.500 | Yes | 90.5 |
1 | Bob | 82.50 | 82.475 | No | 78.2 |
2 | Charlie | 75.30 | 65.300 | Yes | 65.8 |
3 | Diana | 91.20 | 88.900 | Yes | 88.3 |
4 | Edward | 85.94 | 72.200 | No | 72.4 |
5 | Fiona | 88.60 | 92.100 | No | 91.0 |
6 | George | 92.30 | 94.100 | Yes | 94.3 |
7 | Hannah | 85.70 | 83.000 | Yes | 83.5 |
8 | Ian | 70.10 | 70.000 | No | 68.6 |
10 | Liam | 78.90 | 79.700 | Yes | 79.7 |
11 | Mia | 94.20 | 91.400 | No | 90.1 |
12 | Noah | 85.94 | 80.100 | Yes | 80.5 |
13 | Olivia | 86.70 | 87.400 | No | 87.4 |
14 | Sophia | 90.80 | 82.475 | Yes | 82.3 |
Factoring Categorical Data
# 7. Turn the categorical data in the "Office Hours" columns to numerical
"Office Hours"] = np.where(grades_df["Office Hours"] == "Yes", 1, 0)
grades_df[ grades_df
Name | Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|---|
0 | John | 95.00 | 85.500 | 1 | 90.5 |
1 | Bob | 82.50 | 82.475 | 0 | 78.2 |
2 | Charlie | 75.30 | 65.300 | 1 | 65.8 |
3 | Diana | 91.20 | 88.900 | 1 | 88.3 |
4 | Edward | 85.94 | 72.200 | 0 | 72.4 |
5 | Fiona | 88.60 | 92.100 | 0 | 91.0 |
6 | George | 92.30 | 94.100 | 1 | 94.3 |
7 | Hannah | 85.70 | 83.000 | 1 | 83.5 |
8 | Ian | 70.10 | 70.000 | 0 | 68.6 |
10 | Liam | 78.90 | 79.700 | 1 | 79.7 |
11 | Mia | 94.20 | 91.400 | 0 | 90.1 |
12 | Noah | 85.94 | 80.100 | 1 | 80.5 |
13 | Olivia | 86.70 | 87.400 | 0 | 87.4 |
14 | Sophia | 90.80 | 82.475 | 1 | 82.3 |
Removing Columns
# 8. Remove the "Name" column from our dataset
="Name", inplace=True)
grades_df.drop(columns grades_df
Homework | Exam Score | Office Hours | Final Grade | |
---|---|---|---|---|
0 | 95.00 | 85.500 | 1 | 90.5 |
1 | 82.50 | 82.475 | 0 | 78.2 |
2 | 75.30 | 65.300 | 1 | 65.8 |
3 | 91.20 | 88.900 | 1 | 88.3 |
4 | 85.94 | 72.200 | 0 | 72.4 |
5 | 88.60 | 92.100 | 0 | 91.0 |
6 | 92.30 | 94.100 | 1 | 94.3 |
7 | 85.70 | 83.000 | 1 | 83.5 |
8 | 70.10 | 70.000 | 0 | 68.6 |
10 | 78.90 | 79.700 | 1 | 79.7 |
11 | 94.20 | 91.400 | 0 | 90.1 |
12 | 85.94 | 80.100 | 1 | 80.5 |
13 | 86.70 | 87.400 | 0 | 87.4 |
14 | 90.80 | 82.475 | 1 | 82.3 |