# 6. ML
import pandas as pd
# Load the customer records from the CSV and Excel exports into DataFrames.
# The Excel reader is pinned to the openpyxl engine.
csv_data, excel_data = (
    pd.read_csv('customers.csv'),
    pd.read_excel('customers.xlsx', engine='openpyxl'),
)
import json
# Load the JSON export and wrap the parsed records in a DataFrame.
# The statement under `with` is indented so that it forms the block body
# (left un-indented it is an IndentationError), and the file is decoded as
# UTF-8 explicitly instead of with the platform's default encoding.
with open('customers.json', 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)
json_data = pd.DataFrame(json_data)
# Preview the first rows of each freshly loaded dataset.
print("CSV Data:")
print(csv_data.head())
print("\nExcell Data:")
print(excel_data.head())
# The newline escape "\n" had been mistyped as "n\": that printed a literal
# "n\Json Data:" and relied on the invalid escape sequence "\J".
print("\nJson Data:")
print(json_data.head())
# Report the schema and the per-column count of missing values for each
# dataset, in the order CSV, Excel, JSON.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line after
# every report.
for frame in (csv_data, excel_data, json_data):
    frame.info()
    print(frame.isnull().sum())
# Clean each dataset: missing values become 0, then duplicate rows are dropped.
# The CSV data is de-duplicated on whole rows, while the Excel and JSON data
# treat rows sharing the same first_name and last_name as duplicates.
name_key = ['first_name', 'last_name']
csv_data = csv_data.fillna(0).drop_duplicates()
excel_data = excel_data.fillna(0).drop_duplicates(subset=name_key)
json_data = json_data.fillna(0).drop_duplicates(subset=name_key)
# Show the first rows of every dataset after cleaning.
for heading, frame in (
    ("Cleaned CSV data:", csv_data),
    ("\nCleaned excell data:", excel_data),
    ("\nCleaned json data:", json_data),
):
    print(heading)
    print(frame.head())
# Combine the three cleaned datasets into one DataFrame and preview it.
# ignore_index=True already gives the result a fresh 0..n-1 index, so the
# follow-up reset_index(drop=True, inplace=True) call changed nothing and has
# been dropped.
common_df = pd.concat([csv_data, excel_data, json_data], ignore_index=True)
print("Unfilled data")
print(common_df.head())
# 5. Check that the three datasets share the same columns, then derive the
# extra columns on the combined frame.
#
# Index.equals() replaces all(a == b): the element-wise comparison raises
# ValueError when the two column sets differ in length, which is exactly the
# situation this check is meant to report.
columns_match = (
    csv_data.columns.equals(excel_data.columns)
    and csv_data.columns.equals(json_data.columns)
)

# The combined frame is built in both cases (concat takes the union of the
# columns); a mismatch is only reported, as before.
common_df = pd.concat([csv_data, excel_data, json_data], ignore_index=True)
if not columns_match:
    print("Columns are not consistent across datasets.")

# NOTE(review): the original indentation was lost, so it is assumed that the
# derived columns below are computed regardless of the check's outcome --
# confirm against the intended script.

# First run of digits found in each phone value; expand=False yields a Series
# (one value per row) rather than a one-column DataFrame.
common_df['phone_area_code'] = common_df['phone'].str.extract(r'(\d+)', expand=False)
common_df['total_spent'] = common_df['orders'] * common_df['spent']
print("transformed Data")
print(common_df.head())
# 6. Descriptive statistics for the combined data.
spent_column = common_df['spent']

desc_stats = common_df.describe()
# Per job: sum of 'orders' and mean of 'spent'.
agg_data = common_df.groupby('job').agg({'orders': 'sum', 'spent': 'mean'})
total_sales = spent_column.sum()
average_order_value = spent_column.mean()
# Number of rows for each distinct job value.
product_distribution = common_df['job'].value_counts()

# Each heading is followed by its table on the next line.
print("descriptive stats", desc_stats, sep="\n")
print("\nAggragate data by job:", agg_data, sep="\n")
print("\nTotal sales: ", total_sales)
print("\nAverage Order Value: ", average_order_value)
print("\nJob category distribution:", product_distribution, sep="\n")
# 7. Visual representation
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of the 'spent' column for each 'job' value.
# NOTE(review): seaborn's barplot presumably aggregates with the mean by
# default, while the y-axis label says "total" -- confirm which is intended.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=common_df, x='job', y='spent', ax=ax)
ax.set_title('Sales by product category')
ax.set_xlabel('product category')
ax.set_ylabel('total slales')
# Tilt the job labels on the x axis.
plt.xticks(rotation=45)
plt.show()
# Pie chart: share of rows belonging to each job value.
product_distribution = common_df['job'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    product_distribution,
    labels=product_distribution.index,
    autopct='%1.1f%%',  # percentage with one decimal on every wedge
    startangle=140,
)
ax.set_title('job Category distribution')
plt.show()
# Box plot: distribution of the 'spent' column within each job value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=common_df, x='job', y='spent', ax=ax)
ax.set_title('Order Value Distribution by job')
ax.set_xlabel('Job')
ax.set_ylabel('Order Value')
# Turn the job labels on the x axis vertical.
plt.xticks(rotation=90)
plt.show()
# Comments
# Post a Comment