ETL (Extraction, Transformation, Loading) on Toyota’s Carlist in Malaysia by using Python

import requestsfrom bs4 import BeautifulSoupURL = ‘https://newcar.carlist.my/search/toyota’page = requests.get(URL)soup = BeautifulSoup(page.content, ‘html.parser’)
elemes = soup.find_all(‘div’, class_=’car-characteristics’)elements = soup.find_all(‘div’, class_=’car-information’)data = []data2 = []for element in elements:name_element = element.find(‘div’, class_=’car-information__description’)if name_element:name = element.find(‘a’)price = element.find(‘span’, class_=’text-muted total-price pull-left’)data2.append([name.text.strip(), price.text.strip()])for eleme in elemes:desc_eleme = eleme.find_all(‘div’, class_=’listing-specs-item’)body = desc_eleme[0]trans = desc_eleme[1]engine = desc_eleme[2]fuel = desc_eleme[3]seat = desc_eleme[4]data.append([body.text.strip(), trans.text.strip(), engine.text.strip(), fuel.text.strip(), seat.text.strip()])
import pandas as pd
df1 = pd.DataFrame(data2, columns=[‘Name’, ‘Price (RM)’])df2 = pd.DataFrame(data, columns=[‘Body Type’, ‘Transmission’, ‘Engine (cc)’, ‘Fuel Type’, ‘Seat Capacity’])
df = pd.concat([df1, df2], axis=1)df
df[‘Name’]=df[‘Name’].str.replace(‘2020 Toyota’, ‘’)df[‘Price (RM)’]=df[‘Price (RM)’].str.replace(‘RM’,’’).str.replace(‘,’,’’)df[‘Body Type’]=df[‘Body Type’].str.replace(‘Body Type:’, ‘’)df[‘Transmission’]=df[‘Transmission’].str.replace(‘Transmission:’, ‘’)df[‘Engine (cc)’]=df[‘Engine (cc)’].str.replace(‘Engine:’, ‘’).str.replace(‘ cc’, ‘’).str.replace(‘1329 -’,’’).str.replace(‘2393 -’, ‘’).str.replace(‘1987 -’, ‘’)df[‘Fuel Type’]=df[‘Fuel Type’].str.replace(‘Fuel Type:’, ‘’)df[‘Seat Capacity’]=df[‘Seat Capacity’].str.replace(‘Seat Capacity:’, ‘’).str.replace(‘2 -’,’’).str.replace(‘7 -’, ‘’)df
df[[‘Price (RM)’, ‘Engine (cc)’,’Seat Capacity’]].dtypes
df[‘Price (RM)’] = df[‘Price (RM)’].astype(int)df[‘Engine (cc)’] = df[‘Engine (cc)’].astype(int)df[‘Seat Capacity’] = df[‘Seat Capacity’].astype(int)df[[‘Price (RM)’, ‘Engine (cc)’,’Seat Capacity’]].dtypes
df[[‘Name’, ‘Body Type’, ‘Transmission’]]
df[‘Name’] = df[‘Name’].str.lower()df[‘Body Type’] = df[‘Body Type’].str.lower()df[‘Transmission’] = df[‘Transmission’].str.lower()df[[‘Name’, ‘Body Type’, ‘Transmission’]]
df[‘Engine (cc)’]
df[‘Engine (cc)’] = df[‘Engine (cc)’].replace([1496, 1798, 1998, 2393, 2487, 2694], [‘low’, ‘low-medium’, ‘medium’, ‘medium-high’, ‘high’,’highest’])df[‘Engine (cc)’]
%matplotlib inlineimport matplotlib.pyplot as pltlabels = ‘low’, ‘low-medium’, ‘medium’, ‘medium-high’, ‘high’,’highest’sizes = [4, 2, 1, 1, 1, 1]plt.pie(sizes, labels=labels, autopct=’%1.1f%%’, shadow=True, startangle=0)plt.axis(‘equal’)plt.title(‘Percentage of Level of Engine (cc) in Toyota Car’)plt.show()
df[[‘Fuel Type’]]
pd.get_dummies(df[‘Fuel Type’])
%matplotlib inlineimport matplotlib.pyplot as pltdf[‘Fuel Type’].value_counts().plot(kind=’bar’ , figsize=(10, 6), color=’green’)plt.title(‘Histogram of Fuel Type on Toyota Car’)plt.xlabel(‘Fuel Type’)plt.ylabel(‘Number of Toyota Car’)
df.to_csv(‘Carlist.csv’, index=False)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store