DSBDA

GROUP B

ASSIGNMENT NO 1

PERFORM BASIC OPERATIONS ON THE FACEBOOK DATASET

PROGRAM -

import pandas as pd

import numpy as np

# Load the Facebook metrics dataset; the file is semicolon-delimited.
fb = pd.read_csv(r'dataset_Facebook.csv', sep=";")

# Part A --> Create data subsets
# All three subsets use the same columns and differ only in the row range.
# .loc slices by index label and includes BOTH endpoints, so the ranges
# 0-15, 16-30 and 31-50 do not overlap.
subset_columns = ['Page total likes', 'Category', 'Post Month', 'Post Weekday']

fb1 = fb.loc[0:15, subset_columns]
print(fb1)

fb2 = fb.loc[16:30, subset_columns]
print(fb2)

fb3 = fb.loc[31:50, subset_columns]
print(fb3)

# Part B --> Merge Data
# Stack the three subsets row-wise; the original index labels are kept.
merging = pd.concat([fb1, fb2, fb3])
print(merging)

# Part C --> Sort Data
sort_values = fb.sort_values('Page total likes', ascending=True)
print(sort_values)

# Part D --> Transpose of data
print(fb.transpose())

# Part E --> Shaping Reshaping
shaping = fb.shape  # (number of rows, number of columns)
print(shaping)

# pivot_table aggregates with the mean by default, so this is the average
# 'comment' value for every (Type, Category) pair.
pivot_table = pd.pivot_table(fb, index=['Type', 'Category'], values='comment')
print(pivot_table)

# reshaping using array
reshaping_arr = np.array([1, 2, 3, 4, 5, 6])

# reshape() returns a new array and leaves reshaping_arr untouched, so the
# result has to be captured and printed to be visible when run as a script.
reshaped_arr = reshaping_arr.reshape(3, 2)
print(reshaped_arr)

ASSIGNMENT NO 2

PERFORM BASIC OPERATIONS ON THE HEART / AIR QUALITY DATASETS

PROGRAM -

import pandas as pd

import numpy as np

# A) Data Cleaning

aq = pd.read_csv('airquality.csv')
hrt = pd.read_csv('heart.csv')

# Handle missing values in Air Quality dataset.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: the in-place form operates on an intermediate
# object (chained assignment) and is not guaranteed to update `aq`.
aq['Ozone'] = aq['Ozone'].fillna(aq['Ozone'].mean())
aq['Solar.R'] = aq['Solar.R'].fillna(aq['Solar.R'].median())

# Removing duplicate rows
aq = aq.drop_duplicates()

# Remove outliers in Air Quality dataset: keep Ozone readings in [0, 200].
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise SettingWithCopyWarning.
aq = aq[(aq['Ozone'] >= 0) & (aq['Ozone'] <= 200)].copy()

# Standardize the Day column in Air Quality dataset.
# pd.to_datetime() on bare integers reads them as nanoseconds since the Unix
# epoch, which yields meaningless timestamps. Day is presumably a
# day-of-month number (TODO confirm against the CSV), so it is coerced to a
# numeric dtype instead.
aq['Day'] = pd.to_numeric(aq['Day'])

# Handle missing values in Heart Diseases dataset.
# numeric_only=True keeps median() from raising if the file has any
# non-numeric column; such columns are simply left unfilled.
hrt = hrt.fillna(hrt.median(numeric_only=True))

# printing the cleaned dataset
print(aq)
print(hrt)

# b) Data Integration

# Concatenate the datasets column-wise (side by side): axis=1 aligns the two
# frames on their row index and places the columns next to each other.
merged_data = pd.concat([aq, hrt], axis=1)
print(merged_data)

# c) Data Transformation

# Normalization of data min max: rescale each listed column to [0, 1].
# A constant column has max == min and therefore comes out as NaN.
columns_to_normalize = ['Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day']

merged_data[columns_to_normalize] = merged_data[columns_to_normalize].apply(
    lambda x: (x - x.min()) / (x.max() - x.min())
)

# Z-Score Normalization

def ZScore_normalize(x):
    """Return the z-score standardisation of *x*.

    Each value has the mean of ``x`` subtracted and is then divided by the
    standard deviation of ``x``, both as reported by ``x.mean()`` and
    ``x.std()``.

    Args:
        x: An object that supports arithmetic and exposes ``mean()`` and
            ``std()``; in this script it is called with one DataFrame column
            at a time through ``DataFrame.apply``.

    Returns:
        The standardised values, in the same container type as ``x``.
    """
    return (x - x.mean()) / x.std()

# Apply the transformation function to the specified columns
aq[columns_to_normalize] = aq[columns_to_normalize].apply(ZScore_normalize)

# Inspect the data types of the transformed columns
print(aq['Ozone'].dtype)
print(aq['Solar.R'].dtype)

# Build a small sample DataFrame by hand.
# Create separate lists for each column; None marks a missing reading.
Ozone = [41, 36, 12, 18, 37, 29, 23]
Solar_R = [190, 118, 149, 313, None, 299, 99]
Wind = [7.4, 8.0, 12.6, 11.5, 12.6, 10.9, 13.8]
Temp = [67, 72, 74, 62, 65, 66, 68]
Month = [5, 5, 5, 5, 6, 6, 6]
Day = [1, 2, 3, 4, 5, 6, 7]

# Create a dictionary with column names as keys and corresponding lists as values
data = {
    'Ozone': Ozone,
    'Solar.R': Solar_R,
    'Wind': Wind,
    'Temp': Temp,
    'Month': Month,
    'Day': Day,
}

# Create the DataFrame
aqdf = pd.DataFrame(data)

# Display the DataFrame
print(aqdf)

# d) Error Correcting

# Replace the placeholder string 'missing_value' with a real NaN
merged_data = merged_data.replace('missing_value', np.nan)

# Convert columns to numeric data type
merged_data['Ozone'] = pd.to_numeric(merged_data['Ozone'])
merged_data['Solar.R'] = pd.to_numeric(merged_data['Solar.R'])

# Verify the changes on the frame that was actually converted (merged_data,
# not aq).
print(merged_data['Ozone'].dtype)
print(merged_data['Solar.R'].dtype)

ASSIGNMENT NO 3 A

VISUALIZE

A.FACEBOOK

PROGRAM -

import pandas as pd

import numpy as np

# matplotlib library to do visualization

import matplotlib.pyplot as plt

#os.chdir("D:\Dataset")

# Treat the placeholder strings '??' and '????' as missing values (NaN).
# NOTE(review): the path below is machine-specific; point it at your own copy
# of fb.csv before running.
fb=pd.read_csv(r"C:\Users\VISHAL\Desktop\sem-VI\Lab\DSBDA\DataSets\DataSets\fb.csv",index_col=0,na_values=['??','????'])

# Untouched copy of the raw data, taken before any rows are dropped.
fb1 = fb.copy(deep=True)

# Observe shape and values of dataset.
# Bare expressions only display in a notebook; print them so the script
# produces the same information.
print(fb.shape)
fb.info()
print(fb.isnull().sum())

# Removing rows that contain NaN values
fb.dropna(axis=0, inplace=True)

print(fb.size)
print(fb.shape)

## ---- Data Visualization using matplotlib Library -------------------

## SCATTER PLOT
# The title and axis labels describe the columns actually plotted
# ('like' on x, 'share' on y).
plt.scatter(fb['like'], fb['share'], c='red')
plt.title("Scatter Plot of Likes vs Shares")
plt.xlabel('Likes')
plt.ylabel('Shares')
plt.show()

## HISTOGRAM
# bins specifies the number of ranges the values are grouped into.
# A single histogram is drawn: repeated plt.hist() calls before plt.show()
# would overdraw each other on the same axes.
plt.hist(fb['comment'], color='blue', edgecolor='white', bins=8)
plt.title("Histogram of comments")
plt.xlabel('comments')
plt.ylabel('Frequency')
plt.show()

## BAR PLOT
# Count of each category of 'paid'. value_counts() orders by frequency, so
# sort by the category value and take the tick labels from the result; this
# keeps every bar paired with its own label.
counts = fb['paid'].value_counts().sort_index()
print(counts)

paid = tuple(counts.index)
index = np.arange(len(paid))

plt.bar(index, counts, color=['red', 'green'])
plt.title("Bar Plot of Paid Posts")
plt.xlabel('Paid')
plt.ylabel('Frequency')

# Bar labels
plt.xticks(index, paid, rotation=90)
plt.show()

ASSIGNMENT NO 3 B

B.TOYOTA

PROGRAM-

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

# Read the CSV file; the first column is the index and the placeholder
# strings '??' and '????' are read as missing values (NaN).
cars_data = pd.read_csv('Toyota.csv', index_col=0, na_values=['??', '????'])
print(cars_data.shape)

# Drop rows with missing values
cars_data.dropna(axis=0, inplace=True)

# Scatter Plot using matplotlib
plt.scatter(cars_data['Age'], cars_data['Price'], c='blue')
plt.title("Scatter Plot Car Price vs Age")
plt.xlabel('Age in months')
plt.ylabel('Price in Dollars')
plt.show()

# Histogram
plt.hist(cars_data['KM'], color='blue', edgecolor='white', bins=5)
plt.title("Histogram of Kilometer run")
plt.xlabel('Kilometers')
plt.ylabel('Frequency')
plt.show()

# Bar Plot
# value_counts() orders categories by frequency, so the tick labels are taken
# from its index instead of a hand-written tuple; each bar is then always
# labelled with the fuel type it actually counts.
counts = cars_data['FuelType'].value_counts()
fuelTypes = tuple(counts.index)
index = np.arange(len(fuelTypes))

plt.bar(index, counts, color=['red', 'green', 'cyan'])
plt.title("Bar Plot of Fuel Type")
plt.xlabel('Fuel Used')
plt.ylabel('Frequency')
plt.xticks(index, fuelTypes, rotation=90)
plt.show()

# Scatter Plot using seaborn (regplot also overlays a fitted regression line)
sns.set(style='darkgrid')
sns.regplot(x=cars_data['Age'], y=cars_data['Price'])
plt.show()

# Histogram using seaborn
sns.displot(cars_data['Age'])
plt.show()

# Bar Plot using seaborn
sns.countplot(x='FuelType', data=cars_data)
plt.show()

# Box Plot using seaborn: overall price distribution
sns.boxplot(y=cars_data['Price'])
plt.show()

# Box Plot using seaborn: price distribution per fuel type
sns.boxplot(y=cars_data['Price'], x=cars_data['FuelType'])
plt.show()

ENGINEERING HUB

DSBDA

Post a Comment

0 Comments

Popular Posts

SE O.O.PROGRAMMING MCQ

IT SELECT YOUR SUBJECT

SE LDCO MCQ

Categories

Report Abuse

FIND MCQs AND NOTES

PLEASE SELECT YOUR BRANCH

Search

Labels

Footer Menu Widget

ENGINEERING HUB

DSBDA

Post a Comment

0 Comments

Social Plugin

Popular Posts

SE O.O.PROGRAMMING MCQ

IT SELECT YOUR SUBJECT

SE LDCO MCQ

Categories

Report Abuse

FIND MCQs AND NOTES

PLEASE SELECT YOUR BRANCH

Search

Labels

Footer Menu Widget