updates
This commit is contained in:
56
auto24.py
56
auto24.py
@@ -9,8 +9,10 @@ import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import re
|
||||
import json
|
||||
|
||||
from urllib.request import urlopen, Request
|
||||
|
||||
from urllib.request import urlopen
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
@@ -23,27 +25,30 @@ def get_car_features(element, mylist):
|
||||
|
||||
"""Returns the car's features as nested list"""
|
||||
|
||||
car_instance = element.findChild('td',attrs={"class":"make_and_model"})
|
||||
car_instance = element.findChild('div',attrs={"class":"description"})
|
||||
|
||||
#Find ad's id
|
||||
ad_link = car_instance.findChild('a')
|
||||
mylist[0].append(str(ad_link.attrs).split('/')[2].strip("}").strip("'"))
|
||||
ad_link = element.findChild('a',attrs={"class":"row-link"})
|
||||
id_value = str(ad_link.attrs['href']).split('/')[2].strip("}").strip("'")
|
||||
mylist[0].append(id_value)
|
||||
|
||||
#Find car description
|
||||
mylist[1].append(ad_link.get_text())
|
||||
mylist[1].append(car_instance.findChild('a',attrs={"class":"main"}).get_text())
|
||||
|
||||
#Production year
|
||||
year_td = car_instance.find_next_sibling('td',attrs={'class':'year'})
|
||||
year_td = car_instance.findChild('span',attrs={'class':'year'})
|
||||
mylist[2].append(year_td.get_text())
|
||||
|
||||
#Fuel
|
||||
mylist[3].append(car_instance.find_next_sibling('td',attrs={'class':'fuel'}).get_text())
|
||||
mylist[3].append(car_instance.findChild('span',attrs={'class':'fuel'}).get_text())
|
||||
|
||||
#Transmission
|
||||
mylist[4].append(car_instance.find_next_sibling('td',attrs={'class':'transmission'}).get_text())
|
||||
mylist[4].append(car_instance.findChild('span',attrs={'class':'transmission'}).get_text())
|
||||
|
||||
#Price
|
||||
mylist[5].append(car_instance.find_next_sibling('td',attrs={'class':'price'}).get_text())
|
||||
price = car_instance.findChild('span',attrs={'class':'price'})
|
||||
print(price)
|
||||
mylist[5].append(price.get_text() if price else '')
|
||||
|
||||
|
||||
# In[374]:
|
||||
@@ -61,8 +66,8 @@ def scrape_page(url):
|
||||
[], # transmission
|
||||
[], # Price
|
||||
]
|
||||
mytable = soup.find('table',attrs={'id':'usedVehiclesSearchResult'})
|
||||
result_rows = mytable.findChildren('tr',attrs={'class':re.compile(fr"result-row item-.")})
|
||||
mytable = soup.find('div',attrs={'id':'usedVehiclesSearchResult-flex'})
|
||||
result_rows = mytable.findChildren('div',attrs={'class':re.compile(fr"result-row item-.")})
|
||||
for row in result_rows:
|
||||
get_car_features(row,mylist)
|
||||
df = pd.DataFrame(mylist)
|
||||
@@ -90,18 +95,30 @@ import time
|
||||
timeout = time.time() + 60*1
|
||||
dataframes=[]
|
||||
|
||||
#url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&aj=&ssid=46274853&b=4&bw=21&ae=8&af=200&otsi=otsi"
|
||||
url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi"
|
||||
# driver = webdriver.Firefox()
|
||||
# driver.get(url)
|
||||
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
|
||||
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
|
||||
'Accept-Encoding': 'none',
|
||||
'Accept-Language': 'en-US,en;q=0.9,et;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=QLTYnfz3iFPTGXscwrRdyJrkacwt8enoFU4EqQ4w33E-1644415671-0-AVspsg+4OGWPbPvqTxKtvECVzzHBpAluh0vUtjvbrznme9mZ7QRZP5mO28qeaeZ3rvWRlU8ES4oxAzZ+Z8Wy1S4=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A11%3A38+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
|
||||
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A19%3A48+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B; __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=',
|
||||
'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A27%3A50+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
|
||||
'Referer': 'https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&b=4&bw=21&ae=8&af=50&ssid=46274853&ak=50'}
|
||||
autoreq = Request(url, headers=hdr)
|
||||
|
||||
#nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
|
||||
#url = "https://www.auto24.ee/" + nextlink.attrs.get('href')
|
||||
|
||||
has_nextpage=True
|
||||
while(has_nextpage):
|
||||
dataframes.append(scrape_page(url))
|
||||
dataframes.append(scrape_page(autoreq))
|
||||
|
||||
html = urlopen(url)
|
||||
html = urlopen(autoreq)
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
|
||||
nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
|
||||
@@ -138,7 +155,7 @@ type(df_all.iloc[0,0])
|
||||
|
||||
def scrape_detailview(id_string, list_in):
|
||||
url = 'https://www.auto24.ee/used/' + id_string
|
||||
html = urlopen(url)
|
||||
html = urlopen(Request(url, headers=hdr))
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
|
||||
list_in[0].append(str(id_string))
|
||||
@@ -246,9 +263,9 @@ df['km'] = pd.to_numeric(df['km'])
|
||||
|
||||
mypattern = re.compile('(^\d{2,3} ?\d*)')
|
||||
df['price']= df['price'].apply(lambda x: unicodedata.normalize("NFKD",str(x)))
|
||||
df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ",""))
|
||||
df['price']=pd.to_numeric(df['price'])
|
||||
df['price'].unique()
|
||||
# df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ",""))
|
||||
#df['price']=pd.to_numeric(df['price'])
|
||||
#df['price'].unique()
|
||||
|
||||
|
||||
# In[ ]:
|
||||
@@ -289,3 +306,8 @@ df[(df['year']==2009)
|
||||
|
||||
df[['manufacturer','model']].value_counts()
|
||||
|
||||
|
||||
filename = 'file.json'
|
||||
|
||||
with open(filename, 'w') as file:
|
||||
json.dump(df.to_json(), file)
|
||||
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
beautifulsoup4==4.10.0
|
||||
matplotlib==3.3.4
|
||||
numpy==1.19.5
|
||||
pandas==1.3.0
|
||||
seaborn==0.11.2
|
||||
Reference in New Issue
Block a user