DSBDA




                                                                            GROUP B


                                                              ASSIGNMENT NO 1

PERFORM BASIC OPERATIONS ON THE FACEBOOK DATASET

PROGRAM -

import pandas as pd

import numpy as np


# Load the Facebook metrics dataset (values are separated by semicolons).
fb = pd.read_csv(r'dataset_Facebook.csv', sep=";")


# Part A --> Create data subsets

# The three subsets share the same four columns and differ only in row range.
# `.loc` slices by index label and includes BOTH end points.
subset_columns = ['Page total likes', 'Category', 'Post Month', 'Post Weekday']

fb1 = fb.loc[0:15, subset_columns]
print(fb1)

fb2 = fb.loc[16:30, subset_columns]
print(fb2)

fb3 = fb.loc[31:50, subset_columns]
print(fb3)


# Part B --> Merge Data

# Stack the subsets row-wise; each row keeps its original index label.
merging = pd.concat([fb1, fb2, fb3])
print(merging)


# Part C --> Sort Data

# Order all rows by page likes, smallest first (ascending is the default).
sort_values = fb.sort_values(by='Page total likes')
print(sort_values)


# Part D --> Transpose of data

# Swap rows and columns: every original column becomes a row.
print(fb.T)


# Part E --> Shaping Reshaping

# (number of rows, number of columns)
shaping = fb.shape
print(shaping)

# Group by post type and category and summarise the 'comment' column.
# No aggfunc is given, so pivot_table uses its default (the mean).
pivot_table = fb.pivot_table(index=['Type', 'Category'], values='comment')
print(pivot_table)


# reshaping using array

# Six elements stored as a flat, one-dimensional array.
reshaping_arr = np.array([1, 2, 3, 4, 5, 6])

# reshape() does not modify the array it is called on; it returns the 3x2
# result. Capture and print it, otherwise the reshape has no visible effect
# when this runs as a script.
reshaped_arr = reshaping_arr.reshape(3, 2)
print(reshaped_arr)


                                                      ASSIGNMENT NO 2

PERFORM BASIC OPERATIONS ON THE HEART / AIR QUALITY DATASETS

PROGRAM -


import pandas as pd

import numpy as np


# A) Data Cleaning

aq = pd.read_csv('airquality.csv')
hrt = pd.read_csv('heart.csv')


# Handle missing values in Air Quality dataset
# Assign the filled column back rather than calling fillna(inplace=True) on
# aq['...']: the in-place form mutates a temporary Series and, with pandas
# copy-on-write, leaves `aq` itself unchanged.
aq['Ozone'] = aq['Ozone'].fillna(aq['Ozone'].mean())
aq['Solar.R'] = aq['Solar.R'].fillna(aq['Solar.R'].median())


# Removing duplicate values
aq.drop_duplicates(inplace=True)


# Remove outliers in Air Quality dataset
# Keep rows whose Ozone reading lies in [0, 200] (both bounds included).
# .copy() detaches the filtered frame so the column assignment below writes to
# an independent DataFrame instead of a slice of the original.
aq = aq[aq['Ozone'].between(0, 200)].copy()


# Standardize the Day column in Air Quality dataset
# NOTE(review): Day looks like a day-of-month integer - confirm against the
# CSV. pd.to_datetime would read such integers as nanoseconds since
# 1970-01-01, so the column is coerced to a numeric dtype instead; values
# that cannot be parsed become NaN.
aq['Day'] = pd.to_numeric(aq['Day'], errors='coerce')


# Handle missing values in Heart Diseases dataset
# numeric_only=True restricts the median to numeric columns; without it
# pandas >= 2.0 raises if the frame contains any non-numeric column.
hrt.fillna(hrt.median(numeric_only=True), inplace=True)


# printing the cleaned dataset
print(aq)
print(hrt)


# b) Data Integration

# Join the two cleaned frames side by side: axis=1 concatenates column-wise
# and aligns rows on their index labels.
merged_data = pd.concat([aq, hrt], axis=1)
print(merged_data)


# c) Data Transformation

# Min-max normalization: rescale each listed column by its own minimum and
# range, i.e. (value - min) / (max - min).
columns_to_normalize = ['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day']


def _min_max_scale(column):
    """Return *column* shifted by its minimum and divided by its range."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


merged_data[columns_to_normalize] = merged_data[columns_to_normalize].apply(_min_max_scale)


# Z-Score Normalization

def ZScore_normalize(x):
    """Return *x* centred on its mean and divided by its standard deviation.

    *x* must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (it is applied to DataFrame columns in this script).
    """
    centred = x - x.mean()
    return centred / x.std()

# Apply the transformation function to the specified columns
# (DataFrame.apply calls ZScore_normalize once per selected column.)
standardized = aq[columns_to_normalize].apply(ZScore_normalize)
aq[columns_to_normalize] = standardized


# Converting Datatypes
# Report the current dtype of the two columns of interest.
for column_name in ('Ozone', 'Solar.R'):
    print(aq[column_name].dtype)


## get dummies for categorical data


# Create separate arrays for each column
Ozone = [41, 36, 12, 18, 37, 29, 23]
Solar_R = [190, 118, 149, 313, None, 299, 99]  # None stands for a missing reading
Wind = [7.4, 8.0, 12.6, 11.5, 12.6, 10.9, 13.8]
Temp = [67, 72, 74, 62, 65, 66, 68]
Month = [5, 5, 5, 5, 6, 6, 6]
Day = [1, 2, 3, 4, 5, 6, 7]


# Create a dictionary with column names as keys and corresponding arrays as values
# (names and value lists are paired positionally, so keep the two tuples in step)
column_names = ('Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day')
column_values = (Ozone, Solar_R, Wind, Temp, Month, Day)
data = dict(zip(column_names, column_values))


# Create the DataFrame and display it
aqdf = pd.DataFrame(data)
print(aqdf)


# d) Error Correcting

# Replace inconsistent values with desired format
# Every cell equal to the placeholder text 'missing_value' becomes NaN.
merged_data.replace('missing_value', np.nan, inplace=True)


# Convert columns to numeric data type
merged_data['Ozone'] = pd.to_numeric(merged_data['Ozone'])
merged_data['Solar.R'] = pd.to_numeric(merged_data['Solar.R'])

# Verify the changes
# Inspect merged_data - the frame that was just converted - rather than aq,
# whose dtypes are not touched by the statements above.
print(merged_data['Ozone'].dtype)
print(merged_data['Solar.R'].dtype)


                                                      ASSIGNMENT NO 3 A

VISUALIZE

A.FACEBOOK

PROGRAM -


import pandas as pd 

import numpy as np

# matplotlib library to do visualization

import matplotlib.pyplot as plt


#os.chdir("D:\Dataset")


# Read the data, treating the '??' and '????' placeholders as NaN and using
# the first CSV column as the index.
# NOTE(review): the path is absolute and machine-specific - adjust it to
# wherever fb.csv lives before running.
fb=pd.read_csv(r"C:\Users\VISHAL\Desktop\sem-VI\Lab\DSBDA\DataSets\DataSets\fb.csv",index_col=0,na_values=['??','????'])

# Independent copy of the data as loaded, taken before any rows are dropped.
fb1=fb.copy(deep=True)

# Observe shape and values of dataset
# A bare expression such as `fb.shape` only displays a value in a notebook;
# in a script the value must be printed explicitly. (fb.info() prints on its
# own.)
print(fb.shape)
fb.info()
print(fb.isnull().sum())


# removing nan values
fb.dropna(axis=0,inplace=True)

# Size and shape after dropping the incomplete rows.
print(fb.size)
print(fb.shape)


## ---- Data Visualization using matplotlib Library -------------------


## SCATTER PLOT
# Plot the 'like' column (x axis) against the 'share' column (y axis).
plt.scatter(fb['like'],fb['share'],c='red')

# Title and axis labels name the columns that are actually plotted; the
# previous texts described car age and price from a different dataset.
plt.title("Scatter Plot of Likes vs Shares")
plt.xlabel('Likes')
plt.ylabel('Shares')
plt.show()


## HISTOGRAM
# Draw a single histogram of the 'comment' column. Calling plt.hist several
# times before plt.show() stacks every histogram on the same axes, so only
# one call is kept.
# bins specifies how many equal-width ranges the values are counted into
plt.hist(fb['comment'],color='blue', edgecolor='white',bins=8)

plt.title("Histogram of comments")
plt.xlabel('comments')
plt.ylabel('Frequency')
plt.show()


## BAR PLOT
# Count the rows for each value of the categorical 'paid' column.
# value_counts() orders its result by frequency, so sort by label and take
# the tick labels from the result itself: every bar is then guaranteed to
# carry the label of the value it counts, and the number of bars always
# matches the number of labels.
counts=fb['paid'].value_counts().sort_index()
paid=tuple(counts.index)
index=np.arange(len(paid))

plt.bar(index,counts,color=['red','green'])

# Title and axis labels describe the 'paid' column (the previous texts
# referred to fuel types from a different dataset).
plt.title("Bar Plot of Paid")
plt.xlabel('Paid')
plt.ylabel('Frequency')

# Bar label
plt.xticks(index,paid,rotation=90)
plt.show()


                                                          ASSIGNMENT NO 3 B

B.TOYOTA

PROGRAM-


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns


# Read the CSV file
# The first CSV column becomes the index; the placeholders '??' and '????'
# are read as NaN.
cars_data = pd.read_csv('Toyota.csv', index_col=0, na_values=['??', '????'])
print(cars_data.shape)


# Drop rows with missing values (rows are the default axis)
cars_data.dropna(inplace=True)


# Scatter Plot using matplotlib
car_age = cars_data['Age']
car_price = cars_data['Price']
plt.scatter(car_age, car_price, c='blue')
plt.title("Scatter Plot Car Price vs Age")
plt.xlabel('Age in months')
plt.ylabel('Price in Dollars')
plt.show()


# Histogram
# Distribution of the 'KM' column, counted into five bins.
kilometers = cars_data['KM']
plt.hist(kilometers, bins=5, color='blue', edgecolor='white')
plt.title("Histogram of Kilometer run")
plt.xlabel('Kilometers')
plt.ylabel('Frequency')
plt.show()


# Bar Plot
# Count the cars per fuel type. value_counts() orders its result by
# frequency, so the tick labels are taken from the result's own index rather
# than from a hard-coded tuple: each bar is then guaranteed to carry the
# label of the fuel type it counts, whatever the data contains.
counts = cars_data['FuelType'].value_counts()
fuelTypes = tuple(counts.index)
index = np.arange(len(fuelTypes))

plt.bar(index, counts, color=['red', 'green', 'cyan'])
plt.title("Bar Plot of Fuel Type")
plt.xlabel('Fuel Used')
plt.ylabel('Frequency')
plt.xticks(index, fuelTypes, rotation=90)
plt.show()


# Scatter Plot using seaborn
# regplot draws the points together with a fitted regression line.
sns.set(style='darkgrid')
sns.regplot(data=cars_data, x='Age', y='Price')
plt.show()


# Histogram using seaborn
sns.displot(data=cars_data, x='Age')
plt.show()


# Bar Plot using seaborn
# countplot counts the rows per fuel type itself.
sns.countplot(x=cars_data['FuelType'])
plt.show()


# Box Plot using seaborn
# First the overall price distribution, then one box per fuel type.
sns.boxplot(data=cars_data, y='Price')
plt.show()

sns.boxplot(data=cars_data, x='FuelType', y='Price')
plt.show()





Post a Comment

0 Comments