1

 

Step 1: Import Required Libraries

import pandas as pd

from sklearn.datasets import fetch_california_housing

# Load the California housing dataset with scikit-learn's fetch_california_housing.
california_housing = fetch_california_housing()

# Print the DESCR attribute, the description text that accompanies the dataset.
print(california_housing.DESCR)

Step 2: Convert to DataFrame

# Wrap the raw feature matrix in a DataFrame, labelling each column with
# the corresponding feature name supplied by the dataset.
df = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
)

Step 3: Include target variable in the DataFrame

# Add the dataset's target values as a new 'target' column, so the features
# and the target live in the same DataFrame.
df['target'] = california_housing.target

Step 4: View the first few rows of the dataset using the head() method

# Preview the data: a caption line followed by the first 5 rows
# (DataFrame.head() with its default row count).
print("First 5 rows of the dataset:", df.head(), sep="\n")

Step 5: Plot histograms for all numerical features

def plot_histograms(df: pd.DataFrame) -> None:
    """Plot a histogram for the columns of ``df`` and display the figure.

    The function relies on ``matplotlib.pyplot`` being bound to the
    module-level name ``plt`` by the time it is called.

    Args:
        df: DataFrame whose columns are drawn, each with 30 bins.
    """
    # DataFrame.hist creates the figure and the grid of subplots itself.
    df.hist(bins=30, figsize=(12, 10))

    # One shared title for the whole figure rather than per subplot.
    plt.suptitle("Histograms of Numerical Features", fontsize=16)
    plt.show()

Step 6: Plot box plots for all numerical features to detect outliers

def plot_boxplots(df: pd.DataFrame) -> None:
    """Draw one box plot per column of ``df`` on a shared figure.

    The plots are laid out on a grid four columns wide; the number of rows
    is derived from the number of DataFrame columns, so the grid never runs
    out of cells. Drawing uses matplotlib's own ``boxplot`` so that no
    plotting library beyond ``matplotlib.pyplot`` (expected to be bound to
    the module-level name ``plt`` at call time) is required.

    Args:
        df: DataFrame whose columns are plotted. Every column is expected
            to hold numeric data.
    """
    features = list(df.columns)

    grid_cols = 4
    # Ceiling division: enough rows to hold every feature.
    grid_rows = -(-len(features) // grid_cols)

    plt.figure(figsize=(12, 10))
    for i, feature in enumerate(features):
        plt.subplot(grid_rows, grid_cols, i + 1)
        # Drop missing values first: NaNs would leave the box empty.
        plt.boxplot(df[feature].dropna())
        plt.title(f'Box Plot of {feature}')

    plt.tight_layout()
    plt.show()

Step 7: Call the plotting functions

import matplotlib.pyplot as plt

# Render the histograms first, then the box plots, for the same DataFrame.
for draw in (plot_histograms, plot_boxplots):
    draw(df)

Step 8: Identify outliers using the IQR method

print("Outliers Detection:")

# Maps each numerical column name to the number of rows flagged as outliers.
outliers_summary = {}
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

for feature in numerical_features:
    values = df[feature]

    # Interquartile range: the spread of the middle 50% of the values.
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Anything more than 1.5 * IQR outside the quartiles counts as an outlier.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(values < lower_bound) | (values > upper_bound)]
    outlier_count = len(outliers)
    outliers_summary[feature] = outlier_count

    print(f"{feature}: {outlier_count} outliers")

Comments

Popular posts from this blog

3

2