Access to safe water is a basic human right, whether the water is used for drinking, food production, or basic hygiene. Although 70% of the earth’s surface is covered in water, there are still countries and communities that lack easy access to safe drinking water. Unsafe drinking water can be contaminated with hazardous chemicals, animal waste, and dangerous amounts of naturally occurring substances.
Untreated water can pose a health risk and can spread infectious diseases or even cause death. It is to a country's advantage to provide access to safe water, as doing so has been shown to positively impact economic growth.
This dataset records several water quality properties that are used to determine the potability of water. In this case study, I will conduct an exploratory data analysis of the dataset to learn what defines safe drinking water.
# Import all necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
# Read the water potability measurements from the CSV file in the working directory
DATASET_PATH = "water_potability.csv"
df = pd.read_csv(DATASET_PATH)
# Report the table dimensions as a (rows, columns) tuple
df.shape
(3276, 10)
# Preview the first ten rows of the table
df.head(n=10)
# Summary statistics for every water parameter
df.describe()
# Share of missing values per column (in percent), largest share first
missing_counts = df.isnull().sum()
missing_pct = missing_counts / len(df) * 100
missing_pct.sort_values(ascending=False)
Sulfate 23.840049
ph 14.987790
Trihalomethanes 4.945055
Hardness 0.000000
Solids 0.000000
Chloramines 0.000000
Conductivity 0.000000
Organic_carbon 0.000000
Turbidity 0.000000
Potability 0.000000
dtype: float64
# Impute missing values with the mean of their own column.
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on an attribute-accessed column: that chained
# in-place form operates on an intermediate object, is deprecated in recent
# pandas releases, and does not update df when Copy-on-Write is enabled.
# Replace Null Values for Sulfates with Mean Value of Sulfates
S_mean = df["Sulfate"].mean()
df["Sulfate"] = df["Sulfate"].fillna(S_mean)
# Replace Null Value for pH with Mean Value of pH
ph_mean = df["ph"].mean()
df["ph"] = df["ph"].fillna(ph_mean)
# Replace Null Value for Trihalomethanes with Mean Value of Trihalomethanes
Tri_mean = df["Trihalomethanes"].mean()
df["Trihalomethanes"] = df["Trihalomethanes"].fillna(Tri_mean)
# Re-check the percentage of missing values per column after imputation
((df.isnull().sum() / df.shape[0]) * 100).sort_values(ascending=False)
ph 0.0
Hardness 0.0
Solids 0.0
Chloramines 0.0
Sulfate 0.0
Conductivity 0.0
Organic_carbon 0.0
Trihalomethanes 0.0
Turbidity 0.0
Potability 0.0
dtype: float64
# Potability pie chart: share of potable vs. non-potable samples
plt.rcParams.update({'font.sans-serif': 'Arial', 'font.size': 12})
labels = ['Potable', 'Non-Potable']
colors = ['lightskyblue', 'lightcoral']
explode = (0.05, 0)  # pull the first ("Potable") wedge slightly out of the pie
# Count each class once; order the sizes to match `labels` (class 1 first, then class 0)
potability_counts = df.Potability.value_counts()
sizes = [potability_counts[1], potability_counts[0]]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%')
ax1.axis('equal')  # equal axis scaling so the pie is drawn as a circle
ax1.legend(frameon=False, bbox_to_anchor=(0.8, 0.8))
plt.show()
# Split the pH readings by potability label (0 is plotted as "Non-Potable", 1 as "Potable")
ph_0 = df.loc[df.Potability == 0, 'ph']
ph_1 = df.loc[df.Potability == 1, 'ph']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
dashed = dict(linewidth=1.2, color='black', linestyle='--')
# Left panel: overlay both classes; the dashed line at pH 7 separates the two annotations
sns.histplot(data=ph_0, color='crimson', alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=ph_1, color='aqua', alpha=0.5, label="Potable", kde=True, ax=ax_compare)
ax_compare.axvline(x=7, **dashed)
ax_compare.set_title("Comparison of the Distribution of pH Values")
ax_compare.set_xlabel('pH')
ax_compare.annotate("< 7 (Acidic)", xy=(3, 300), size=11)
ax_compare.annotate("> 7 (Basic)", xy=(8.5, 300), size=11)
ax_compare.legend()
# Right panel: histogram of every pH value regardless of class
sns.histplot(data=df, x='ph', color="blueviolet", kde=True, ax=ax_overall)
ax_overall.axvline(x=7, **dashed)
ax_overall.set_title("Distribution of pH Values")
ax_overall.set_xlabel('pH')
ax_overall.annotate("< 7 (Acidic)", xy=(3, 450))
ax_overall.annotate("> 7 (Basic)", xy=(8.5, 450))
plt.show()
# Split the hardness readings by potability label (0 is plotted as "Non-Potable", 1 as "Potable")
hd_0 = df.loc[df.Potability == 0, 'Hardness']
hd_1 = df.loc[df.Potability == 1, 'Hardness']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
dashed = dict(linewidth=1.2, color='black', linestyle='--')
hardness_limits = (61, 120, 180)  # x positions of the dashed category boundaries
# Left panel: overlay both classes
sns.histplot(data=hd_0, color='crimson', alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=hd_1, color='aqua', alpha=0.5, label="Potable", kde=True, ax=ax_compare)
ax_compare.set_title("Comparison of the Distribution of Hardness Values")
ax_compare.set_xlabel("Hardness (mg/L)")
ax_compare.annotate("61 to 120\nSlightly Hard", xy=(65, 125), size=8.5)
ax_compare.annotate("> 180\nVery Hard", xy=(250, 125), size=8.5)
for limit in hardness_limits:
    ax_compare.axvline(x=limit, **dashed)
ax_compare.legend()
# Right panel: histogram of every hardness value regardless of class
sns.histplot(data=df, x='Hardness', color="blueviolet", kde=True, ax=ax_overall)
ax_overall.set_title("Distribution of Hardness Values")
ax_overall.set_xlabel("Hardness (mg/L)")
ax_overall.annotate("61 to 120\nSlightly Hard", xy=(65, 175), size=8.5)
ax_overall.annotate(" > 180\nVery Hard", xy=(250, 175), size=8.5)
for limit in hardness_limits:
    ax_overall.axvline(x=limit, **dashed)
plt.show()
# Share of samples (in percent) whose hardness exceeds 180, the "very hard" boundary
very_hard_count = df.Hardness.gt(180).sum()
pct_vhard = very_hard_count / len(df) * 100
print(pct_vhard)
71.45909645909646
# Split the total dissolved solids (TDS) readings by potability label
tds_0 = df.loc[df.Potability == 0, 'Solids']
tds_1 = df.loc[df.Potability == 1, 'Solids']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
# Draw the histograms first so the labels and reference line below are applied on top of them
sns.histplot(data=tds_0, color="crimson", alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=tds_1, color="aqua", alpha=0.5, label="Potable", kde=True, ax=ax_compare)
sns.histplot(data=df, x='Solids', color="blueviolet", kde=True, ax=ax_overall)
ax_compare.set_title("Comparison of the Distribution of TDS Values")
ax_overall.set_title("Distribution of TDS Values")
# Both panels share the axis label, the dashed line at 1000 and the annotation text;
# only the annotation height differs between them
for axis, text_height in ((ax_compare, 150), (ax_overall, 225)):
    axis.set_xlabel("Solids (ppm)")
    axis.axvline(x=1000, linewidth=1.2, color='black', linestyle='--')
    axis.annotate(">1000 \nWHO max\ncontamination\nlevel", xy=(1600, text_height), size=8.5)
ax_compare.legend()
plt.show()
# Share of samples (in percent) whose Solids value is above the 1000 limit
over_limit_count = df.Solids.gt(1000).sum()
pct_tds = over_limit_count / len(df) * 100
print(pct_tds)
99.93894993894995
# Split the chloramines readings by potability label
chl_0 = df.loc[df.Potability == 0, 'Chloramines']
chl_1 = df.loc[df.Potability == 1, 'Chloramines']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
# Draw the histograms first so the labels and reference line below are applied on top of them
sns.histplot(data=chl_0, color="crimson", alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=chl_1, color="aqua", alpha=0.5, label="Potable", kde=True, ax=ax_compare)
sns.histplot(data=df, x='Chloramines', color="blueviolet", kde=True, ax=ax_overall)
ax_compare.set_title("Comparison of the Distribution of Chloramines Values")
ax_overall.set_title("Distribution of Chloramines Values")
# Shared decorations: axis label, dashed line at 4 and the annotation
# (placed at a different height in each panel)
for axis, text_height in ((ax_compare, 125), (ax_overall, 170)):
    axis.set_xlabel("Chloramines (ppm)")
    axis.axvline(x=4, linewidth=1.2, color='black', linestyle='--')
    axis.annotate("<4 Safe to drink", xy=(.5, text_height), size=11)
ax_compare.legend()
plt.show()
# Share of samples (in percent) whose chloramines value is below 4
below_limit_count = df.Chloramines.lt(4).sum()
pct_cl = below_limit_count / len(df) * 100
print(pct_cl)
2.716727716727717
# Split the sulfate readings by potability label
sf_0 = df.loc[df.Potability == 0, 'Sulfate']
sf_1 = df.loc[df.Potability == 1, 'Sulfate']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
# (x position of dashed line, annotation text, annotation x position)
sulfate_marks = ((250, "<250 Ideal", 180), (500, "<500 Good", 425))
# Left panel: overlay both classes (no KDE curve on the sulfate plots)
sns.histplot(data=sf_0, color="crimson", alpha=0.5, label="Non-Potable", ax=ax_compare)
sns.histplot(data=sf_1, color="aqua", alpha=0.5, label="Potable", ax=ax_compare)
ax_compare.set_title("Comparison of the Distribution of Sulfate Values")
ax_compare.set_xlabel("Sulfate (ppm)")
for limit, text, text_x in sulfate_marks:
    ax_compare.axvline(x=limit, linewidth=1.2, color='black', linestyle='--')
    ax_compare.annotate(text, xy=(text_x, 350), size=11)
ax_compare.legend()
# Right panel: histogram of every sulfate value regardless of class
sns.histplot(data=df, x='Sulfate', color="blueviolet", ax=ax_overall)
ax_overall.set_title("Distribution of Sulfate Values")
ax_overall.set_xlabel("Sulfate (ppm)")
for limit, text, text_x in sulfate_marks:
    ax_overall.axvline(x=limit, linewidth=1.2, color='black', linestyle='--')
    ax_overall.annotate(text, xy=(text_x, 525), size=11)
plt.show()
# Share of samples (in percent) whose sulfate value is below 250 (annotated as "Ideal" in the plots)
ideal_sulfate_count = df.Sulfate.lt(250).sum()
pct_sf = ideal_sulfate_count / len(df) * 100
print(pct_sf)
1.7704517704517704
# Split the conductivity readings by potability label
cnd_0 = df.loc[df.Potability == 0, 'Conductivity']
cnd_1 = df.loc[df.Potability == 1, 'Conductivity']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
# Draw the histograms first so the labels and reference line below are applied on top of them
sns.histplot(data=cnd_0, color="crimson", alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=cnd_1, color="aqua", alpha=0.5, label="Potable", kde=True, ax=ax_compare)
sns.histplot(data=df, x='Conductivity', color="blueviolet", kde=True, ax=ax_overall)
ax_compare.set_title("Comparison of the Distribution of Conductivity Values")
ax_overall.set_title("Distribution of Conductivity Values")
# Shared decorations: axis label, dashed line at 400 and the annotation
# (placed at a different height in each panel)
for axis, text_height in ((ax_compare, 125), (ax_overall, 175)):
    axis.set_xlabel("Conductivity (μS/cm)")
    axis.axvline(x=400, linewidth=1.2, color='black', linestyle='--')
    axis.annotate("<400 Considered\nsafe to drink", xy=(180, text_height), size=11)
ax_compare.legend()
plt.show()
# Share of samples (in percent) whose conductivity is below 400
below_400_count = df.Conductivity.lt(400).sum()
pct_cnd = below_400_count / len(df) * 100
print(pct_cnd)
40.10989010989011
# Split the organic carbon readings by potability label
org_0 = df.loc[df.Potability == 0, 'Organic_carbon']
org_1 = df.loc[df.Potability == 1, 'Organic_carbon']
# One figure, two panels: per-class comparison (left) and overall distribution (right)
fig, (ax_compare, ax_overall) = plt.subplots(1, 2, figsize=(13, 5))
# Draw the histograms first so the labels and reference line below are applied on top of them
sns.histplot(data=org_0, color="crimson", alpha=0.5, label="Non-Potable", kde=True, ax=ax_compare)
sns.histplot(data=org_1, color="aqua", alpha=0.5, label="Potable", kde=True, ax=ax_compare)
sns.histplot(data=df, x='Organic_carbon', color="blueviolet", kde=True, ax=ax_overall)
ax_compare.set_title("Comparison of the Distribution\nof Organic Carbon Values")
ax_overall.set_title("Distribution of Organic Carbon Values")
# Shared decorations: axis label, dashed line at 2 and the annotation
# (placed at a different height in each panel)
for axis, text_height in ((ax_compare, 125), (ax_overall, 175)):
    axis.set_xlabel("Organic Carbon (ppm)")
    axis.axvline(x=2, linewidth=1.2, color='black', linestyle='--')
    axis.annotate("<2 is the standard\nfor treated water", xy=(3.2, text_height), size=11)
ax_compare.legend()
plt.show()
# What percent of the water tested was above 2 ppm of organic carbon?
# (The comparison is `> 2`, so this counts samples exceeding the value, not those under it.)
pct_org = ((df.Organic_carbon > 2).sum() / df.shape[0]) * 100
print(pct_org)
100.0
# Compare the water parameter with regards to Potability
tri_0 = df.Trihalomethanes[df.Potability == 0]
tri_1 = df.Trihalomethanes[df.Potability == 1]
# Create figure to compare potable and non-potable values, add annotations and add vertical lines for annotations
plt.figure(figsize=(13,5))
plt.subplot(1,2,1)
# Each subset Series is passed directly as `data`, the same way the other
# parameter plots do it; the earlier `data=df, x=<Series>` form supplied a
# frame that is redundant when x is already a Series.
sns.histplot(data=tri_0, color="crimson",alpha=0.5, label="Non-Potable", kde=True)
sns.histplot(data=tri_1, color="aqua", alpha=0.5, label="Potable", kde=True)
plt.title("Comparison of the Distribution of\nTrihalomethanes Values")
# Set the axis label after plotting so it is the one shown on the panel
plt.xlabel("Trihalomethanes (μg/L)")
# Dashed reference line at 80, the value named in the annotation
plt.axvline(x=80, linewidth=1.2, color='black', linestyle='--')
plt.annotate("<80 is considered\nsafe to drink", xy=(90, 200), size=11)
plt.legend()
# Create histogram for all parameter values
plt.subplot(1,2,2)
sns.histplot(data=df, x='Trihalomethanes', color="blueviolet", kde=True)
plt.title("Distribution of Trihalomethanes Values")
plt.xlabel("Trihalomethanes (μg/L)")
plt.axvline(x=80, linewidth=1.2, color='black', linestyle='--')
plt.annotate("<80 is considered\nsafe to drink", xy=(90, 200), size=11)
plt.show()
# Share of samples (in percent) whose trihalomethanes value is below 80 micrograms/L
below_80_count = df.Trihalomethanes.lt(80).sum()
below_80_count / len(df) * 100
81.62393162393163
# What percent of ALL samples are labeled potable yet have Trihalomethanes above 80?
# NOTE(review): the denominator is the full dataset (tri.shape[0]), not the number of
# potable samples, so this is not the share *of potable water* that exceeds the limit;
# divide by tri_1.shape[0] instead if that share is what is wanted.
tri = df.Trihalomethanes
((tri_1 > 80).sum() / tri.shape[0]) * 100
7.264957264957266
# Compare the water parameter with regards to Potability
tur_0 = df.Turbidity[df.Potability == 0]
tur_1 = df.Turbidity[df.Potability == 1]
# Create figure to compare potable and non-potable values, add annotations and add vertical lines for annotations
plt.figure(figsize=(13,5))
plt.subplot(1,2,1)
# Each subset Series is passed directly as `data`, the same way the other
# parameter plots do it; the earlier `data=df, x=<Series>` form supplied a
# frame that is redundant when x is already a Series.
sns.histplot(data=tur_0, color="crimson",alpha=0.5, label="Non-Potable", kde=True)
sns.histplot(data=tur_1, color="aqua", alpha=0.5, label="Potable", kde=True)
plt.title("Comparison of the Distribution of\nTurbidity Values")
# Set the axis label after plotting so it is the one shown on the panel
plt.xlabel("Turbidity (NTU)")
# Dashed reference line at 5, the value named in the annotation
plt.axvline(x=5, linewidth=1.2, color='black', linestyle='--')
plt.annotate("<5 is highly\nrecommended", xy=(5.2, 125), size=11)
plt.legend()
# Create histogram for all parameter values
plt.subplot(1,2,2)
sns.histplot(data=df, x='Turbidity', color="blueviolet", kde=True)
plt.title("Distribution of Turbidity Values")
plt.xlabel("Turbidity (NTU)")
plt.axvline(x=5, linewidth=1.2, color='black', linestyle='--')
plt.annotate("<5 is highly\nrecommended", xy=(5.2, 175), size=11)
plt.show()
# Share of samples (in percent) whose turbidity is below the 5 NTU limit
below_5_count = df.Turbidity.lt(5).sum()
pct_tur = below_5_count / len(df) * 100
print(pct_tur)
90.41514041514041
# Create Boxplot for all parameters
# One panel per column in a 3x3 grid; columns[0:9] selects the first nine columns of df.
plt.figure(figsize=(15,15))
for count, column in enumerate(df.columns[0:9]):
    # subplot positions are 1-based, enumerate is 0-based
    plt.subplot(3, 3, count+1)
    # DataFrame.boxplot draws onto the current axes, i.e. the subplot just selected
    df.boxplot(column)
# Render the figure explicitly, as the other plotting sections do
plt.show()
# Correlation heatmap: pairwise correlation of all columns, each cell annotated with its value
correlation_matrix = df.corr()
plt.figure(figsize=(13, 6))
heatmap = sns.heatmap(correlation_matrix, annot=True, cmap='Blues')
heatmap.set_title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')