This commit is contained in:
2026-02-01 14:51:55 +02:00
parent 868d4dab21
commit 55c5fd7807
3 changed files with 45 additions and 17 deletions

View File

@@ -9,8 +9,10 @@ import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import seaborn as sns import seaborn as sns
import re import re
import json
from urllib.request import urlopen, Request
from urllib.request import urlopen
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -23,27 +25,30 @@ def get_car_features(element, mylist):
"""Returns the car's features as nested list""" """Returns the car's features as nested list"""
car_instance = element.findChild('td',attrs={"class":"make_and_model"}) car_instance = element.findChild('div',attrs={"class":"description"})
#Find ad's id #Find ad's id
ad_link = car_instance.findChild('a') ad_link = element.findChild('a',attrs={"class":"row-link"})
mylist[0].append(str(ad_link.attrs).split('/')[2].strip("}").strip("'")) id_value = str(ad_link.attrs['href']).split('/')[2].strip("}").strip("'")
mylist[0].append(id_value)
#Find car description #Find car description
mylist[1].append(ad_link.get_text()) mylist[1].append(car_instance.findChild('a',attrs={"class":"main"}).get_text())
#Production year #Production year
year_td = car_instance.find_next_sibling('td',attrs={'class':'year'}) year_td = car_instance.findChild('span',attrs={'class':'year'})
mylist[2].append(year_td.get_text()) mylist[2].append(year_td.get_text())
#Fuel #Fuel
mylist[3].append(car_instance.find_next_sibling('td',attrs={'class':'fuel'}).get_text()) mylist[3].append(car_instance.findChild('span',attrs={'class':'fuel'}).get_text())
#Transmission #Transmission
mylist[4].append(car_instance.find_next_sibling('td',attrs={'class':'transmission'}).get_text()) mylist[4].append(car_instance.findChild('span',attrs={'class':'transmission'}).get_text())
#Price #Price
mylist[5].append(car_instance.find_next_sibling('td',attrs={'class':'price'}).get_text()) price = car_instance.findChild('span',attrs={'class':'price'})
print(price)
mylist[5].append(price.get_text() if price else '')
# In[374]: # In[374]:
@@ -61,8 +66,8 @@ def scrape_page(url):
[], # transmission [], # transmission
[], # Price [], # Price
] ]
mytable = soup.find('table',attrs={'id':'usedVehiclesSearchResult'}) mytable = soup.find('div',attrs={'id':'usedVehiclesSearchResult-flex'})
result_rows = mytable.findChildren('tr',attrs={'class':re.compile(fr"result-row item-.")}) result_rows = mytable.findChildren('div',attrs={'class':re.compile(fr"result-row item-.")})
for row in result_rows: for row in result_rows:
get_car_features(row,mylist) get_car_features(row,mylist)
df = pd.DataFrame(mylist) df = pd.DataFrame(mylist)
@@ -90,18 +95,30 @@ import time
timeout = time.time() + 60*1 timeout = time.time() + 60*1
dataframes=[] dataframes=[]
#url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&aj=&ssid=46274853&b=4&bw=21&ae=8&af=200&otsi=otsi"
url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi" url = "https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=101&aj=&f1=2018&g2=12000&k1=60&ae=2&af=50&ag=0&ag=1&otsi=otsi"
# driver = webdriver.Firefox() # driver = webdriver.Firefox()
# driver.get(url) # driver.get(url)
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.9,et;q=0.8',
'Connection': 'keep-alive',
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=QLTYnfz3iFPTGXscwrRdyJrkacwt8enoFU4EqQ4w33E-1644415671-0-AVspsg+4OGWPbPvqTxKtvECVzzHBpAluh0vUtjvbrznme9mZ7QRZP5mO28qeaeZ3rvWRlU8ES4oxAzZ+Z8Wy1S4=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A11%3A38+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
# 'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A19%3A48+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B; __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=',
'Cookie': 'CID=1621884104633711; my_searches_notif=1; PHPSESSID=sc66u21lot8te95edda73985tu; OptanonAlertBoxClosed=2022-01-26T10:18:23.206Z; __utma=13167336.877242740.1643192318.1643192318.1643192318.1; __utmc=13167336; __utmz=13167336.1643192318.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __cf_bm=cp_7820d1SeeDxv37StQH7HMg0KZgMVEHbcD_JFYmQ0-1644416588-0-AWYK6qWmiAJwRtqvbtwuBgDJU2FauySHu6Mn9R6vYiUIrvWkcgs1khFFJcpvWmFP9o9m1LkwHNRsRSvgFPTo/bY=; OptanonConsent=isIABGlobal=false&datestamp=Wed+Feb+09+2022+16%3A27%3A50+GMT%2B0200+(Eastern+European+Standard+Time)&version=6.16.0&hosts=&landingPath=NotLandingPage&groups=C0004%3A1%2CC0003%3A1%2CC0002%3A1%2CC0001%3A1&AwaitingReconsent=false&geolocation=%3B',
'Referer': 'https://www.auto24.ee/kasutatud/nimekiri.php?bn=2&a=100&b=4&bw=21&ae=8&af=50&ssid=46274853&ak=50'}
autoreq = Request(url, headers=hdr)
#nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"}) #nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
#url = "https://www.auto24.ee/" + nextlink.attrs.get('href') #url = "https://www.auto24.ee/" + nextlink.attrs.get('href')
has_nextpage=True has_nextpage=True
while(has_nextpage): while(has_nextpage):
dataframes.append(scrape_page(url)) dataframes.append(scrape_page(autoreq))
html = urlopen(url) html = urlopen(autoreq)
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"}) nextlink = soup.find('a',attrs={'class':"input-link item","rel":"next"})
@@ -138,7 +155,7 @@ type(df_all.iloc[0,0])
def scrape_detailview(id_string, list_in): def scrape_detailview(id_string, list_in):
url = 'https://www.auto24.ee/used/' + id_string url = 'https://www.auto24.ee/used/' + id_string
html = urlopen(url) html = urlopen(Request(url, headers=hdr))
soup = BeautifulSoup(html, 'lxml') soup = BeautifulSoup(html, 'lxml')
list_in[0].append(str(id_string)) list_in[0].append(str(id_string))
@@ -246,9 +263,9 @@ df['km'] = pd.to_numeric(df['km'])
mypattern = re.compile('(^\d{2,3} ?\d*)') mypattern = re.compile('(^\d{2,3} ?\d*)')
df['price']= df['price'].apply(lambda x: unicodedata.normalize("NFKD",str(x))) df['price']= df['price'].apply(lambda x: unicodedata.normalize("NFKD",str(x)))
df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ","")) # df['price'] = df['price'].apply(lambda x: mypattern.findall(x)[0].replace(" ",""))
df['price']=pd.to_numeric(df['price']) #df['price']=pd.to_numeric(df['price'])
df['price'].unique() #df['price'].unique()
# In[ ]: # In[ ]:
@@ -289,3 +306,8 @@ df[(df['year']==2009)
df[['manufacturer','model']].value_counts() df[['manufacturer','model']].value_counts()
filename = 'file.json'
with open(filename, 'w') as file:
json.dump(df.to_json(), file)

1
file.json Normal file

File diff suppressed because one or more lines are too long

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
beautifulsoup4==4.10.0
matplotlib==3.3.4
numpy==1.19.5
pandas==1.3.0
seaborn==0.11.2