# coding: utf-8
#
# Reconstructed from git patch:
#   From 3b2be913eca4428fd4611c19fa0e254f93bd448b Mon Sep 17 00:00:00 2001
#   From: Tarmo
#   Date: Sun, 2 Dec 2018 15:11:02 +0200
#   Subject: [PATCH] commiting 'auto24.py'
#
# Jupyter-notebook export: scrapes used-car search results from auto24.ee
# (list pages + per-ad detail pages), merges the two tables with pandas and
# does some exploratory cleaning/analysis.
#
# NOTE(review): running this module performs live HTTP requests at import
# time — it is a script, not a library.

import re
import time
import unicodedata
from urllib.request import urlopen

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup


# **Scrape results list excluding detailviews**

def get_car_features(element, mylist):
    """Extract one result row's features and append them to *mylist*.

    Parameters
    ----------
    element : bs4.Tag
        One ``<tr>`` result row of the search-results table.
    mylist : list[list]
        Six parallel lists, mutated in place:
        [id, description, year, fuel, transmission, price].
    """
    car_instance = element.findChild('td', attrs={"class": "make_and_model"})

    # Ad id: second '/'-separated piece of the stringified attrs dict.
    # NOTE(review): parsing str(ad_link.attrs) is fragile (depends on dict
    # repr); ad_link.get('href').split('/') would be more robust — kept
    # as-is to preserve the exact extracted value.
    ad_link = car_instance.findChild('a')
    mylist[0].append(str(ad_link.attrs).split('/')[2].strip("}").strip("'"))

    # Free-text car description (make, model, engine, ...).
    mylist[1].append(ad_link.get_text())

    # Production year.
    year_td = car_instance.find_next_sibling('td', attrs={'class': 'year'})
    mylist[2].append(year_td.get_text())

    # Fuel.
    mylist[3].append(car_instance.find_next_sibling('td', attrs={'class': 'fuel'}).get_text())

    # Transmission.
    mylist[4].append(car_instance.find_next_sibling('td', attrs={'class': 'transmission'}).get_text())

    # Price (raw text; numeric cleaning happens later).
    mylist[5].append(car_instance.find_next_sibling('td', attrs={'class': 'price'}).get_text())


def scrape_page(url):
    """Scrape one search-results page into a DataFrame.

    Returns a DataFrame with one row per ad and positional columns
    0..5 = [id, description, year, fuel, transmission, price]
    (renamed later by the caller).
    """
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')

    mylist = [[],  # id
              [],  # car description
              [],  # production year
              [],  # fuel
              [],  # transmission
              [],  # price
              ]
    mytable = soup.find('table', attrs={'id': 'usedVehiclesSearchResult'})
    # FIX: original used fr"result-row item-." — an f-string with no
    # placeholders. A plain raw string produces the identical pattern.
    result_rows = mytable.findChildren('tr', attrs={'class': re.compile(r"result-row item-.")})
    for row in result_rows:
        get_car_features(row, mylist)
    df = pd.DataFrame(mylist)
    transposed = df.transpose()
    return transposed


def click_next(driver):
    """Click the 'next page' element via Selenium (leftover experiment).

    NOTE(review): ``WebDriverWait``, ``EC`` and ``By`` are never imported
    in this file — this function is dead code from an abandoned Selenium
    approach and raises ``NameError`` if actually called. The live code
    path paginates by following the ``rel="next"`` link instead.
    """
    try:
        myelement = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "next-page"))
        )
        myelement.click()
    # FIX: bare `except:` also swallowed KeyboardInterrupt / SystemExit;
    # narrowed to Exception.
    except Exception:
        print('Did not find the element.')


timeout = time.time() + 60 * 1  # unused leftover from the Selenium attempt
dataframes = []

url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi"
# driver = webdriver.Firefox()
# driver.get(url)

# Follow the rel="next" pagination link until the last results page.
has_nextpage = True
while has_nextpage:
    dataframes.append(scrape_page(url))

    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')

    nextlink = soup.find('a', attrs={'class': "input-link item", "rel": "next"})
    has_nextpage = bool(nextlink)
    if has_nextpage:
        url = "https://www.auto24.ee/" + nextlink.attrs.get('href')


# Concatenate all result pages and name the columns.
df_all = pd.concat(dataframes, ignore_index=True)
df_all.columns = ['id', 'description', 'year', 'fuel', 'gearbox', 'price']
df_all['id'] = pd.to_numeric(df_all['id'])
df_all.dtypes


type(df_all.iloc[0, 0])


def scrape_detailview(id_string, list_in):
    """Scrape one ad's detail page; append body type, date and km to *list_in*.

    Parameters
    ----------
    id_string : str
        Numeric ad id as a string (appended to the /used/ URL).
    list_in : list[list]
        Four parallel lists, mutated in place: [id, body, man_date, km].
        Missing fields are recorded as ``np.nan``.
    """
    url = 'https://www.auto24.ee/used/' + id_string
    html = urlopen(url)
    soup = BeautifulSoup(html, 'lxml')

    list_in[0].append(str(id_string))
    body_type = np.nan
    manuf_date = np.nan
    km = np.nan

    main_data = soup.find('table', attrs={"class": "section main-data"})

    if main_data:
        # Estonian field classes: keretyyp = body type, labisoit = mileage.
        tr_body = main_data.findChild('tr', attrs={'class': 'field-keretyyp'})
        tr_date = main_data.findChild('tr', attrs={'class': 'field-month_and_year'})
        tr_km = main_data.findChild('tr', attrs={'class': 'field-labisoit'})

        body_type = tr_body.findChild('span', attrs={'class': 'value'}).get_text() if tr_body else body_type
        manuf_date = tr_date.findChild('span', attrs={'class': 'value'}).get_text() if tr_date else manuf_date
        km = tr_km.findChild('span', attrs={'class': 'value'}).get_text() if tr_km else km

    list_in[1].append(body_type)
    list_in[2].append(manuf_date)
    list_in[3].append(km)


# Fetch the detail page of every ad found in the list pages.
adslist = [[],  # 0 id
           [],  # 1 body_type
           [],  # 2 manufacturing date
           []   # 3 kilometers driven
           ]

for adid in list(df_all['id']):
    scrape_detailview(str(adid), adslist)


pd.options.display.max_rows = 999
df_ads = pd.DataFrame(adslist).transpose()
df_ads.columns = ['id', 'body', 'man_date', 'km']
df_ads['id'] = pd.to_numeric(df_ads['id'])
df_ads.head()


df_all.head()
# df_all.drop_duplicates(subset=['id'],inplace=True)
# df_ads.drop_duplicates(subset=['id'],inplace=True)


# Join list-page data onto the detail-page data by ad id.
df = df_ads.merge(df_all, how='left', on=['id'])


df.head()


# Split "Manufacturer Model ..." description into separate columns.
df['model'] = df['description'].apply(lambda x: x.split(' ')[1])
df['manufacturer'] = df['description'].apply(lambda x: x.split()[0])

df.head(3)


# FIX: regex patterns below were plain strings containing \d escapes
# (DeprecationWarning on modern Python); raw strings yield the identical
# pattern. Matches a leading 2-3 digit group optionally followed by more
# digits after a space, e.g. '11 900' in '11 900 sis. KM'.
mypattern = re.compile(r'(^\d{2,3} ?\d*)')


# Exploratory check of the price pattern (notebook scratch cell).
matchobject = mypattern.findall('19 000 sis. KM')[0]
entries = re.search(r'(^\d{2,3} ?\d{3})', '11 900 sis. KM')
# entries.group(0)
(matchobject)


# new_str = unicodedata.normalize("NFKD", unicode_str)
# Normalize NBSP thousands separators, drop the ' km' suffix and the
# remaining spaces, then convert to numbers. NaN mileage becomes 0.
# NOTE(review): .strip(' km') strips the CHARACTERS ' ', 'k', 'm' from both
# ends, not the literal suffix — works for '123 456 km' but would also eat
# leading/trailing k/m digits-adjacent chars; kept to preserve behaviour.
df['km'].fillna("0", inplace=True)
df['km'] = df['km'].apply(lambda x: unicodedata.normalize("NFKD", str(x)))
df['km'] = df['km'].apply(lambda x: str(x).strip(' km').replace(' ', ''))
df['km'] = pd.to_numeric(df['km'])


# Same cleaning for price: keep the leading digit group, drop spaces.
mypattern = re.compile(r'(^\d{2,3} ?\d*)')
df['price'] = df['price'].apply(lambda x: unicodedata.normalize("NFKD", str(x)))
df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ", ""))
df['price'] = pd.to_numeric(df['price'])
df['price'].unique()


df['year'] = pd.to_numeric(df['year'])
df.dtypes


df.sort_values('description', inplace=True)
df.head()


df[df['manufacturer'] == 'Volkswagen']


df[df['year'] == 2011].groupby(['manufacturer', 'model']).count()


df[(df['year'] == 2009)
   & (df['manufacturer'] == 'Ford')]['km'].plot.hist(bins=20)


df[['manufacturer', 'model']].value_counts()