GROUP B
ASSIGNMENT NO 1
PERFORM BASIC OPERATIONS ON THE FACEBOOK DATASET
PROGRAM -
import pandas as pd
import numpy as np
# Load the Facebook dataset; the file uses ';' as its field separator.
fb = pd.read_csv(r'dataset_Facebook.csv', sep=";")

# Part A --> Create data subsets
# Every subset uses the same four columns, so name the list once.
subset_columns = ['Page total likes', 'Category', 'Post Month', 'Post Weekday']
# .loc slices by label and includes BOTH endpoints, so the three subsets
# cover row labels 0-15, 16-30 and 31-50 without overlapping.
fb1 = fb.loc[0:15, subset_columns]
print(fb1)
fb2 = fb.loc[16:30, subset_columns]
print(fb2)
fb3 = fb.loc[31:50, subset_columns]
print(fb3)

# Part B --> Merge Data
# Stack the three subsets on top of each other (row-wise concatenation).
merging = pd.concat([fb1, fb2, fb3])
print(merging)

# Part C --> Sort Data
sort_values = fb.sort_values('Page total likes', ascending=True)
print(sort_values)

# Part D --> Transpose of data
print(fb.transpose())

# Part E --> Shaping Reshaping
shaping = fb.shape  # (number of rows, number of columns)
print(shaping)
# Aggregate 'comment' for each (Type, Category) pair; pivot_table uses the
# mean as its aggregation when no aggfunc is given.
pivot_table = pd.pivot_table(fb, index=['Type', 'Category'], values='comment')
print(pivot_table)

# reshaping using array
reshaping_arr = np.array([1, 2, 3, 4, 5, 6])
# reshape() returns a new array and leaves reshaping_arr untouched, so the
# result has to be captured and printed to be visible in a script.
reshaped_arr = reshaping_arr.reshape(3, 2)
print(reshaped_arr)
ASSIGNMENT NO 2
PERFORM BASIC OPERATIONS ON THE HEART / AIR QUALITY DATASETS
PROGRAM -
import pandas as pd
import numpy as np
# A) Data Cleaning
aq = pd.read_csv('airquality.csv')
hrt = pd.read_csv('heart.csv')

# Handle missing values in Air Quality dataset.
# Assign the filled column back instead of calling fillna(inplace=True) on
# aq['Ozone']: the in-place form operates on an intermediate Series and is
# not guaranteed to write through to aq.
aq['Ozone'] = aq['Ozone'].fillna(aq['Ozone'].mean())
aq['Solar.R'] = aq['Solar.R'].fillna(aq['Solar.R'].median())

# Removing duplicate rows
aq = aq.drop_duplicates()

# Remove outliers in Air Quality dataset: keep Ozone readings in [0, 200].
# .copy() makes the filtered result an independent frame so the column
# assignment below does not act on a view of the unfiltered data.
aq = aq[(aq['Ozone'] >= 0) & (aq['Ozone'] <= 200)].copy()

# Standardize the Day column in Air Quality dataset.
# pd.to_datetime on bare integers treats them as nanoseconds since the Unix
# epoch, which would turn a day number into a 1970 timestamp. Day is
# presumably a day-of-month integer (confirm against airquality.csv), so it
# is coerced to a clean numeric column instead; unparsable entries become NaN.
aq['Day'] = pd.to_numeric(aq['Day'], errors='coerce')

# Handle missing values in Heart Diseases dataset with per-column medians.
# numeric_only=True keeps this working if heart.csv contains any non-numeric
# column (those columns are then left unfilled).
hrt = hrt.fillna(hrt.median(numeric_only=True))

# printing the cleaned dataset
print(aq)
print(hrt)

# b) Data Integration
# Concatenate the datasets horizontally (axis=1): columns of hrt are placed
# next to the columns of aq, with rows aligned on the index.
merged_data = pd.concat([aq, hrt], axis=1)
print(merged_data)

# c) Data Transformation
# Min-max normalization: rescale each listed column to the range [0, 1].
columns_to_normalize = ['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day']
merged_data[columns_to_normalize] = merged_data[columns_to_normalize].apply(
    lambda x: (x - x.min()) / (x.max() - x.min())
)
# Z-Score Normalization
def ZScore_normalize(x):
    """Return *x* shifted by its mean and divided by its standard deviation.

    Args:
        x: An object supporting ``.mean()``, ``.std()`` and element-wise
            arithmetic; it is used with pandas Series (one column at a time
            via ``DataFrame.apply``).

    Returns:
        ``(x - x.mean()) / x.std()``. For a pandas Series, ``std()`` is the
        sample standard deviation (ddof=1).
    """
    return (x - x.mean()) / x.std()
# Apply the z-score transformation to the specified columns of aq
aq[columns_to_normalize] = aq[columns_to_normalize].apply(ZScore_normalize)

# Inspect the current datatypes
print(aq['Ozone'].dtype)
print(aq['Solar.R'].dtype)

# Build a small sample Air Quality DataFrame by hand.
# Create separate lists for each column (None marks a missing reading).
Ozone = [41, 36, 12, 18, 37, 29, 23]
Solar_R = [190, 118, 149, 313, None, 299, 99]
Wind = [7.4, 8.0, 12.6, 11.5, 12.6, 10.9, 13.8]
Temp = [67, 72, 74, 62, 65, 66, 68]
Month = [5, 5, 5, 5, 6, 6, 6]
Day = [1, 2, 3, 4, 5, 6, 7]
# Create a dictionary with column names as keys and corresponding lists as values
data = {'Ozone': Ozone, 'Solar.R': Solar_R, 'Wind': Wind, 'Temp': Temp, 'Month': Month, 'Day': Day}
# Create the DataFrame
aqdf = pd.DataFrame(data)
# Display the DataFrame
print(aqdf)

# d) Error Correcting
# Replace the placeholder string 'missing_value' with a real NaN
merged_data.replace('missing_value', np.nan, inplace=True)
# Convert columns to numeric data type
merged_data['Ozone'] = pd.to_numeric(merged_data['Ozone'])
merged_data['Solar.R'] = pd.to_numeric(merged_data['Solar.R'])
# Verify the changes on the frame that was actually converted (merged_data)
print(merged_data['Ozone'].dtype)
print(merged_data['Solar.R'].dtype)
ASSIGNMENT NO 3 A
VISUALIZE
A.FACEBOOK
PROGRAM -
import pandas as pd
import numpy as np
# matplotlib library to do visualization
import matplotlib.pyplot as plt
# Treat the placeholder strings '??' and '????' as missing values (NaN)
fb = pd.read_csv(r"C:\Users\VISHAL\Desktop\sem-VI\Lab\DSBDA\DataSets\DataSets\fb.csv", index_col=0, na_values=['??', '????'])
# Keep an untouched deep copy of the raw data before any rows are dropped
fb1 = fb.copy(deep=True)

# Observe shape and values of dataset.
# Bare expressions only display in a notebook; print them so a script shows them.
print(fb.shape)
fb.info()  # info() writes its own report to stdout
print(fb.isnull().sum())

# Remove every row that contains at least one NaN value
fb.dropna(axis=0, inplace=True)
print(fb.size)
print(fb.shape)

## ---- Data Visualization using matplotlib Library -------------------
## SCATTER PLOT
# Title and axis labels describe the columns actually plotted (like vs share)
plt.scatter(fb['like'], fb['share'], c='red')
plt.title("Scatter Plot of likes vs shares")
plt.xlabel('like')
plt.ylabel('share')
plt.show()

## HISTOGRAM
# Draw a single histogram: several plt.hist calls before plt.show() are
# overlaid on the same axes and hide one another.
# bins specifies how many intervals the value range is divided into.
plt.hist(fb['comment'], color='blue', edgecolor='white', bins=8)
plt.title("Histogram of comments")
plt.xlabel('comments')
plt.ylabel('Frequency')
plt.show()

## BAR PLOT
# Count how often each value of the categorical 'paid' column occurs
counts = fb['paid'].value_counts()
# value_counts() orders its result by frequency, so take the tick labels
# from its index; hard-coded labels could end up on the wrong bars.
paid = tuple(counts.index)
index = np.arange(len(paid))
plt.bar(index, counts, color=['red', 'green'])
plt.title("Bar Plot of paid")
plt.xlabel('paid')
plt.ylabel('Frequency')
# Bar labels
plt.xticks(index, paid, rotation=90)
plt.show()
ASSIGNMENT NO 3 B
B.TOYOTA
PROGRAM-
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the CSV file, treating the placeholder strings '??' and '????' as NaN
cars_data = pd.read_csv('Toyota.csv', index_col=0, na_values=['??', '????'])
print(cars_data.shape)

# Drop rows with missing values
cars_data.dropna(axis=0, inplace=True)

# Scatter Plot using matplotlib
plt.scatter(cars_data['Age'], cars_data['Price'], c='blue')
plt.title("Scatter Plot Car Price vs Age")
plt.xlabel('Age in months')
plt.ylabel('Price in Dollars')
plt.show()

# Histogram
plt.hist(cars_data['KM'], color='blue', edgecolor='white', bins=5)
plt.title("Histogram of Kilometer run")
plt.xlabel('Kilometers')
plt.ylabel('Frequency')
plt.show()

# Bar Plot
counts = cars_data['FuelType'].value_counts()
# value_counts() orders its result by frequency, so read the labels from its
# index; a hard-coded tuple matches the bars only if the frequencies happen
# to fall in that same order.
fuelTypes = tuple(counts.index)
index = np.arange(len(fuelTypes))
plt.bar(index, counts, color=['red', 'green', 'cyan'])
plt.title("Bar Plot of Fuel Type")
plt.xlabel('Fuel Used')
plt.ylabel('Frequency')
plt.xticks(index, fuelTypes, rotation=90)
plt.show()

# Scatter Plot with a fitted regression line using seaborn
sns.set(style='darkgrid')
sns.regplot(x=cars_data['Age'], y=cars_data['Price'])
plt.show()

# Histogram using seaborn
sns.displot(cars_data['Age'])
plt.show()

# Bar Plot using seaborn (one bar per FuelType value, height = row count)
sns.countplot(x='FuelType', data=cars_data)
plt.show()

# Box Plot using seaborn: Price overall, then Price split by FuelType
sns.boxplot(y=cars_data['Price'])
plt.show()
sns.boxplot(y=cars_data['Price'], x=cars_data['FuelType'])
plt.show()

0 Comments