1
Step 1: Import Required Libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset through scikit-learn's fetcher. The
# returned object's .data, .feature_names and .target attributes are used
# further down to build the DataFrame.
california_housing = fetch_california_housing()
# Print the DESCR attribute (presumably the dataset's text description).
print(california_housing.DESCR)
Step 2: Convert to DataFrame
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# matching entry of the dataset's feature_names.
df = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
)
Step 3: Include target variable in the DataFrame
# Store the dataset's target values in a new 'target' column next to the
# feature columns.
df['target'] = california_housing.target
Step 4: View the first few rows of the dataset using the head() method
# Preview the data: head(n=5) returns the first five rows of the DataFrame.
print("First 5 rows of the dataset:")
print(df.head(n=5))
Step 5: Plot histograms for all numerical features
def plot_histograms(df: pd.DataFrame) -> None:
    """Plot a histogram of each numeric column of ``df`` and show the figure.

    Args:
        df: DataFrame to plot. ``DataFrame.hist`` draws one subplot per
            numeric column, each with 30 bins, on a 12x10 inch figure.

    Note:
        Uses the module-level name ``plt`` (``matplotlib.pyplot``), which
        must be imported before this function is called.
    """
    df.hist(bins=30, figsize=(12, 10))
    # suptitle sets one title for the whole figure rather than per subplot.
    plt.suptitle("Histograms of Numerical Features", fontsize=16)
    plt.show()
Step 6: Plot box plots for all numerical features to detect outliers
def plot_boxplots(df: pd.DataFrame) -> None:
    """Draw a box plot for every column of ``df`` on one figure and show it.

    Each column gets its own subplot, laid out four per row. The grid has at
    least three rows (a 3x4 layout) and grows by whole rows when ``df`` has
    more than 12 columns, so the subplot index always fits inside the grid.

    Args:
        df: DataFrame whose columns are plotted; the columns are expected to
            be numeric.

    Note:
        Uses the module-level name ``plt`` (``matplotlib.pyplot``), which
        must be imported before this function is called.
    """
    # seaborn is not imported at module level anywhere in this file, so it is
    # imported here to keep the function from failing with a NameError.
    import seaborn as sns

    n_cols = 4
    # Ceiling division for the rows needed, never fewer than the 3 rows of
    # the default layout.
    n_rows = max(3, -(-len(df.columns) // n_cols))

    plt.figure(figsize=(12, 10))
    # Subplot positions are 1-based, hence start=1.
    for position, feature in enumerate(df.columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(df[feature])
        plt.title(f'Box Plot of {feature}')
    # Adjust spacing once all subplots exist so titles and axes do not overlap.
    plt.tight_layout()
    plt.show()
Step 7: Call the plotting functions
# matplotlib is imported here, after the plotting functions are defined but
# before they are called; the functions look up the global name `plt` only
# when they run, so it resolves at call time.
import matplotlib.pyplot as plt
# Show the histograms first, then the box plots, for the full DataFrame
# (feature columns plus 'target').
plot_histograms(df)
plot_boxplots(df)
Step 8: Identify outliers using the IQR method
# Count outliers in every numeric column with the IQR method: a value is an
# outlier when it lies more than 1.5 * IQR below the first quartile or above
# the third quartile.
print("Outliers Detection:")
outliers_summary = {}  # column name -> number of rows flagged as outliers
numerical_features = df.select_dtypes(
    include=['float64', 'int64']
).columns.tolist()
for feature in numerical_features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Strict comparisons: values exactly on a bound are not counted.
    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")
Comments
Post a Comment