6. ML

import json

import pandas as pd

# Load the customer data from its three source files (CSV, Excel, JSON) into
# one DataFrame per source.
csv_data = pd.read_csv('customers.csv')
excel_data = pd.read_excel('customers.xlsx', engine='openpyxl')

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('customers.json', 'r', encoding='utf-8') as json_file:
    json_records = json.load(json_file)

json_data = pd.DataFrame(json_records)


# Preview the first rows of each raw dataset.
print("CSV Data:")
print(csv_data.head())

print("\nExcell Data:")
print(excel_data.head())

# The heading starts with a newline escape ("\n"); a misplaced backslash
# ("n\J...") would print a literal "n\" and is an invalid escape sequence.
print("\nJson Data:")
print(json_data.head())


# Report the schema and the per-column missing-value counts of each raw
# dataset, in the order CSV, Excel, JSON.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
for frame in (csv_data, excel_data, json_data):
    frame.info()
    print(frame.isnull().sum())



# Clean each dataset: substitute 0 for every missing value, then drop
# duplicate rows. CSV rows are de-duplicated on all columns; Excel and JSON
# rows are de-duplicated on the first/last name pair only.
name_key = ['first_name', 'last_name']

csv_data = csv_data.fillna(0).drop_duplicates()
excel_data = excel_data.fillna(0).drop_duplicates(subset=name_key)
json_data = json_data.fillna(0).drop_duplicates(subset=name_key)


# Preview the first rows of each dataset after cleaning.
cleaned_previews = (
    ("Cleaned CSV data:", csv_data),
    ("\nCleaned excell data:", excel_data),
    ("\nCleaned json data:", json_data),
)
for heading, frame in cleaned_previews:
    print(heading)
    print(frame.head())


# Combine the three cleaned datasets into one common DataFrame.
# ignore_index=True already gives the result a fresh 0..n-1 index, so no
# separate reset_index() call is needed afterwards.
common_df = pd.concat([csv_data, excel_data, json_data], ignore_index=True)

print("Unfilled data")
print(common_df.head())


# 5. Check that the three datasets have the same columns before deriving new
# ones. Index.equals() compares labels and their order and returns False when
# the column counts differ, whereas an element-wise `==` between Index objects
# of different lengths raises ValueError before the message can be printed.
columns_consistent = (
    csv_data.columns.equals(excel_data.columns)
    and csv_data.columns.equals(json_data.columns)
)

if not columns_consistent:
    print("Columns are not consistent across datasets.")
else:
    common_df = pd.concat([csv_data, excel_data, json_data], ignore_index=True)

    # First run of digits found in each phone value; expand=False returns a
    # Series, which is the natural shape for a single new column.
    # NOTE(review): assumes 'phone' holds strings - confirm against the data.
    common_df['phone_area_code'] = common_df['phone'].str.extract(
        r'(\d+)', expand=False
    )
    common_df['total_spent'] = common_df['orders'] * common_df['spent']

    print("transformed Data")
    print(common_df.head())

    

# 6. Descriptive statistics for the combined data.
spent_column = common_df['spent']
job_groups = common_df.groupby('job')

desc_stats = common_df.describe()
# Per job: sum of 'orders' and mean of 'spent'.
agg_data = job_groups.agg({'orders': 'sum', 'spent': 'mean'})
total_sales = spent_column.sum()
average_order_value = spent_column.mean()
# Number of rows for each distinct 'job' value.
product_distribution = common_df['job'].value_counts()

# Report the results in the same order they were computed.
print("descriptive stats")
print(desc_stats)

print("\nAggragate data by job:")
print(agg_data)

print("\nTotal sales: ", total_sales)
print("\nAverage Order Value: ", average_order_value)

print("\nJob category distribution:")
print(product_distribution)


# 7. Representation (charts)

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns


# Bar chart of the total 'spent' per job.
# sns.barplot aggregates each category with the mean by default, which would
# not match the "total" y-axis label, so the totals are computed up front and
# plotted one bar per row. sort=False keeps the groups in first-appearance
# order instead of sorting them.
job_totals = common_df.groupby('job', sort=False, as_index=False)['spent'].sum()

plt.figure(figsize=(10, 6))
sns.barplot(x='job', y='spent', data=job_totals)
plt.title('Sales by product category')
plt.xlabel('product category')
plt.ylabel('total slales')
plt.xticks(rotation=45)
plt.show()


# Pie chart: share of rows for each distinct 'job' value.
product_distribution = common_df['job'].value_counts()
job_labels = product_distribution.index

plt.figure(figsize=(6, 6))
plt.pie(
    product_distribution,
    labels=job_labels,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('job Category distribution')
plt.show()


# Box plot: distribution of the 'spent' values within each job.
plt.figure(figsize=(8, 6))

# sns.boxplot draws on the current axes and returns them, so the labels are
# set through the returned Axes object.
box_axes = sns.boxplot(data=common_df, x='job', y='spent')
box_axes.set_title('Order Value Distribution by job')
box_axes.set_xlabel('Job')
box_axes.set_ylabel('Order Value')

plt.xticks(rotation=90)
plt.show()



Comments

Popular posts from this blog

DREAM LEAGUE SOCCER 2019 MOD

10 CL3

Garena Contra Return