Initial commit
This commit is contained in:
38
scraper_utilities/angolajsontest.py
Normal file
38
scraper_utilities/angolajsontest.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
Used for parsing saved json response from angola-based websites
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
with open('testing.json', 'r') as f:
|
||||
st = f.read()
|
||||
|
||||
decoded = json.loads(st)
|
||||
|
||||
products = decoded['hits']
|
||||
|
||||
for product in products:
|
||||
bad_conditions = [
|
||||
product['product_type'].lower() in ["gift card", "music"],
|
||||
not product['product_published'],
|
||||
'product' not in product
|
||||
]
|
||||
if any(bad_conditions):
|
||||
continue
|
||||
|
||||
if 'product' not in product:
|
||||
print(json.dumps(product))
|
||||
break
|
||||
|
||||
item_info = {
|
||||
"ShopSourceID": 1,
|
||||
"Url": 'https://www.jbhifi.com.au/products/' + product['handle'],
|
||||
"BaseUrl": "jbhifi.com.au",
|
||||
"SKU": product['sku'],
|
||||
"MPN": product['product']['supplierProductDetails'][0]['supplierStockCode'],
|
||||
"Model": product['product']['model'],
|
||||
"Title": product['title'],
|
||||
"PriceIncTax": product['price'],
|
||||
"IsInStock": product['availability']['canBuyOnline'],
|
||||
"QuantityAvailable": None
|
||||
}
|
||||
46
scraper_utilities/angolatesting.py
Normal file
46
scraper_utilities/angolatesting.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
Used for attempting to get data from angola-based websites.
|
||||
|
||||
"""
|
||||
|
||||
import requests
|
||||
import re
|
||||
import json
|
||||
from scrapy import Selector
|
||||
|
||||
|
||||
|
||||
xhr_headers = {
|
||||
'Accept': 'application/json, text/javascript, */*; q=0.01',
|
||||
'content-type': 'application/json',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/88.0',
|
||||
'x-algolia-api-key': 'YmE4MTVkMzc1YmU4ZDcxM2QxNTMzNDZlMzVlZjBjMTk4MmM5ZWU0NjBlN2I0NjE2NTk1M2VjZDg3MzQ1YjVmMXRhZ0ZpbHRlcnM9',
|
||||
'x-algolia-application-id': '3QIRNP5HAI'
|
||||
}
|
||||
|
||||
|
||||
url = 'https://3qirnp5hai-2.algolianet.com/1/indexes/*/queries?x-algolia-agent=Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.10.2;Magento2 integration (1.11.3);JS Helper 2.26.0'
|
||||
|
||||
# reqs = {
|
||||
# "query": "",
|
||||
# "hitsPerPage": 100,
|
||||
# "page": 0,
|
||||
# "filters": "product_published = 1 AND availability.displayProduct = 1"
|
||||
# }
|
||||
|
||||
reqs = {
|
||||
"requests": [
|
||||
{
|
||||
"indexName": "production_default_products_price_default_desc",
|
||||
"params": "query=&hitsPerPage=1000&filters=price.AUD.default < 500"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
r = requests.post(url, json.dumps(reqs), headers=xhr_headers)
|
||||
|
||||
with open("testing.json", "w") as f:
|
||||
f.write(r.text)
|
||||
48
scraper_utilities/cookies.py
Normal file
48
scraper_utilities/cookies.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Used for capturing cookies after a request to specific website
|
||||
"""
|
||||
|
||||
|
||||
from time import sleep
|
||||
import json
|
||||
from selenium import webdriver
|
||||
|
||||
|
||||
def check_captcha(driver, check_string):
    """Block until the captcha text disappears from the page.

    Polls ``driver.page_source`` every 5 seconds while *check_string*
    (a phrase that appears on the captcha/challenge page) is still
    present, giving a human time to solve the challenge in the browser.

    BUG FIX: the original loop condition was inverted
    (``while check_string not in driver.page_source``), which made the
    function spin forever on a normal page and return immediately when
    a captcha was actually shown — the opposite of what its own log
    messages describe.
    """
    print('Checking for captcha')
    # Wait while the challenge text is still on the page.
    while check_string in driver.page_source:
        print('Found captcha. Waiting.')
        sleep(5)
    print('Captcha not found, proceeding')
|
||||
|
||||
|
||||
def main():
    """Open the target shop in Firefox and return its session cookies.

    Launches a fresh browser, loads the shop front page, prints the
    browser's user-agent (it must match the one the captured cookies
    were issued to when replayed from scripted requests), grabs the
    cookies, and closes the window.
    """
    target_url = 'https://www.mwave.com.au/'
    browser = webdriver.Firefox()
    browser.get(target_url)

    # Optionally pause here for a manual captcha solve:
    # check_captcha(browser, 'Your current connection has triggered our security challenge, please complete the chall')

    captured = browser.get_cookies()

    print(browser.execute_script("return navigator.userAgent;"))

    browser.close()

    return captured
|
||||
|
||||
|
||||
def decode_cookies(cookies):
    """Serialise Selenium cookie records to cookies.json.

    *cookies* is the list of dicts returned by ``driver.get_cookies()``;
    only each record's 'name' and 'value' are kept, as a flat mapping.
    """
    jar = {entry['name']: entry['value'] for entry in cookies}
    with open('cookies.json', 'w') as f:
        json.dump(jar, f)
|
||||
|
||||
|
||||
# Script entry point: capture cookies in a live browser session, then
# persist them to cookies.json for the request scripts to load.
c = main()
decode_cookies(c)
|
||||
21
scraper_utilities/htmltesting.py
Normal file
21
scraper_utilities/htmltesting.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
Used for writing parsing scripts using saved html file
|
||||
Allows parsing a page without requesting the page each time
|
||||
"""
|
||||
|
||||
from scrapy import Selector
|
||||
import json
|
||||
|
||||
|
||||
|
||||
with open('testing.html', 'r') as f:
|
||||
st = f.read()
|
||||
|
||||
response = Selector(text=st)
|
||||
|
||||
# ==================
|
||||
article_ids = response.css('article[id]::attr(id)').getall()
|
||||
|
||||
links = {'https://voice.mv/%s/' % x.split('-')[1] for x in article_ids}
|
||||
|
||||
print(links)
|
||||
13
scraper_utilities/jsontesting.py
Normal file
13
scraper_utilities/jsontesting.py
Normal file
@@ -0,0 +1,13 @@
|
||||
"""
|
||||
Used for writing parsing scripts using saved html file
|
||||
Allows parsing a page without requesting the api each time
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
with open('testing.json', 'r') as f:
|
||||
st = f.read()
|
||||
|
||||
l = json.loads(st)
|
||||
|
||||
print(l)
|
||||
118
scraper_utilities/program.py
Normal file
118
scraper_utilities/program.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import requests
|
||||
from scrapy import Selector
|
||||
import json
|
||||
|
||||
|
||||
# Headers mimicking the browser XHR issued by the causeiq directory
# page; 'x-csrftoken' is filled in later, after the initial GET.
headers = {
    "content-type": "application/json",
    "accept": "application/json",
    "referer": "https://www.causeiq.com/directory/business-and-community-development-organizations-list/",
    "origin": "https://www.causeiq.com",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}

# Base POST body for directory searches; code below overwrites
# 'filters' (and sometimes 'pageNumber') in place before each request.
payload = {
    "filters": [],
    "pageNumber": 1,
    "sortDir": "desc",
    "sortHow": "popularity"
}
|
||||
|
||||
url = 'https://www.causeiq.com/directory/business-and-community-development-organizations-list/'

# Initial GET establishes a session and yields the CSRF cookie that the
# POST endpoint requires; the token is echoed back via a header.
r_init = requests.get(url, headers=headers)
cookies = r_init.cookies
print(r_init.headers['set-cookie'])
headers['x-csrftoken'] = cookies['csrftoken']


# The 'metros' endpoint returns a JSON object; only its keys (metro
# ids, presumably — TODO confirm against the API) are used as filter
# values below.
stations = requests.get('https://www.causeiq.com/directory/retrieve_variable/metros/')
stations = json.loads(stations.text).keys()
|
||||
|
||||
|
||||
def get_stations():
    """Count directory entries per metro ("station") across all metros.

    POSTs one single-filter search per metro id from the module-level
    `stations` view and accumulates the 'count' field of each response.
    """
    print('Total stations: ', len(stations))

    total_companies = 0
    for i in stations:
        filters = [{
            "hash": "1",
            "type": "metro",
            "value": str(i)
        }]
        # NOTE: mutates the shared module-level payload in place.
        payload['filters'] = filters
        r = requests.post(
            url,
            json.dumps(payload),
            headers=headers,
            cookies=cookies
        )

        decoded = json.loads(r.text)
        total_companies += decoded['count']
        # NOTE(review): `s` is built but never printed or returned —
        # presumably a `print(s)` was intended here; confirm.
        s = 'Got metro %s with %s companies. Current total: %s' % (i, decoded['count'], total_companies)
|
||||
|
||||
|
||||
def get_full_station(station):
    """Enumerate one metro's companies by fanning out over keywords a-z.

    First prints the metro's headline count from a single metro filter,
    then re-queries with each letter of the alphabet as an additional
    keyword filter and sums the per-letter counts (presumably to work
    around a per-query result cap — TODO confirm; note that companies
    matching several letters would be counted more than once).
    """
    filters = [{
        "hash": "1",
        "type": "metro",
        "value": station
    }]
    # NOTE: mutates the shared module-level payload in place.
    payload['filters'] = filters
    r = requests.post(
        url,
        json.dumps(payload),
        headers=headers,
        cookies=cookies
    )
    decoded = json.loads(r.text)
    print(decoded['count'])

    letters = 'abcdefghijklmnopqrstuvwxyz'

    total = 0
    # NOTE(review): loop variable `l` is easily misread (PEP 8 / E741);
    # kept as-is to avoid touching the request payloads.
    for l in letters:
        filters = [
            {
                "hash": "1",
                "type": "metro",
                "value": station
            },
            {
                "hash": "2",
                "type": "keywords",
                "value": l
            }
        ]
        payload['filters'] = filters
        r = requests.post(
            url,
            json.dumps(payload),
            headers=headers,
            cookies=cookies
        )
        decoded = json.loads(r.text)
        total += decoded['count']
        print('Got %s companies from letter %s. Total: %s' % (decoded['count'], l, total))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# print("one")
|
||||
|
||||
# print('two')
|
||||
# payload['pageNumber'] = 2
|
||||
# r = requests.post(
|
||||
# 'https://www.causeiq.com/directory/business-and-community-development-organizations-list/',
|
||||
# json.dumps(payload),
|
||||
# headers=headers,
|
||||
# cookies=cookies
|
||||
# )
|
||||
print("end")
|
||||
|
||||
# filename = 'responses.json'
|
||||
# with open(filename, 'w') as f:
|
||||
# f.write(r.text)
|
||||
27
scraper_utilities/request.py
Normal file
27
scraper_utilities/request.py
Normal file
@@ -0,0 +1,27 @@
|
||||
'''
Used for requesting data from website.
Generated files are targets for htmltesting/jsontesting scripts
'''
import requests
import json


# NOTE(review): these headers (opaque 'token', origin cryptoslam.io)
# look copied from a cryptoslam.io NFT-sales request, while `url`
# below points at adhadhu.com — confirm which target this scratch
# script is currently aimed at.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'token': 'b5ozEsESGV/jmwjy0v2MjRTYLnCozdQG2Kt1GOqfJ/1pKFgOi/1u5lyID2ZGcZSN0jaLB4pFpbJ5kJaFaiXyZ09eF4H6GXK3/x5euFTCOvWc8rqx22knnnJMgJY4q/mBbnf2+oJ4G5p3FU+Am2kPVP70OJ+oS/Gv18GtDXbVxrKozOiwaNrF6O7oEVmldDENCl34N0d3Pl7f53cYGKBWArkieRgLjbrjkEU2hMS++vuT+1JIAmW45OKpw1oT2ueQORgmZ5yaSw6xxOFpwoMvjIXSas81yUKvykHDvRFFeTaIAW6lyyLpQ/TC2rzntea4ASwCmn8XiHs3lkwP6OEvaQ==',
    'origin': 'https://cryptoslam.io',
    'accept': 'application/json, text/javascript, */*; q=0.01'
    # 'Cookie': 'XSRF-TOKEN=eyJpdiI6IkhBamFqSmVsdUNoT1F4WkF0NXdxSEE9PSIsInZhbHVlIjoiKzdQN1diREYwanJiSGFBVTZDV0tlSzdPY1BYelBOWDNad1RXTjk1anJqUnpmK2lqdXJUd3Q2TkJPa1p2TmtKQiIsIm1hYyI6IjkzMmE4YmY0YTk0OGU4YWFhMTMxMDZjMTY1MzU0ZTA4NzAxOTI5MmVmNGJmMWZiNmE3YmQ5ZGE5NDVhMDA3YmIifQ==; expires=Mon, 13-Jun-2022 09:55:00 GMT; Max-Age=15600000; path=/, one_online_session=eyJpdiI6IjRMaW83bnNlajBidmNabFg4d3ErYkE9PSIsInZhbHVlIjoiXC9wdFNwdWdIOTZLaU9HMkU2dzFSU0huYTdPa3c0Q3BmbGp5RFRvcHEyS01nelk0R1JNczRcL0lUdzhRa211MTByIiwibWFjIjoiNzI3N2NkOGE2YzY3ODBkZDg1ZjA1NGJhZTAxNzcwNDQ4NDc5NDRlMzYwNmJkYzY0N2JlYTBhMjU2YTM1M2YzYiJ9; expires=Mon, 13-Jun-2022 09:55:00 GMT; Max-Age=15600000; path=/; httponly',
}

url = 'https://adhadhu.com/'
# DataTables-style server-side request body (draw / columns / order /
# paging); sent verbatim as the raw POST payload.
data = '''{"draw":1,"columns":[{"data":null,"name":"","searchable":true,"orderable":false,"search":{"value":"","regex":false}},{"data":null,"name":"TimeStamp","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"Tokens.Attributes.Background","searchable":true,"orderable":false,"search":{"value":"","regex":false}},{"data":null,"name":"PriceDoubleType","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"PriceUSDDoubleType","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"Tokens.Attributes","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"Tokens.Attributes.SerialNumber","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"","searchable":true,"orderable":true,"search":{"value":"","regex":false}},{"data":null,"name":"","searchable":true,"orderable":false,"search":{"value":"","regex":false}}],"order":[{"column":1,"dir":"desc"}],"start":0,"length":50,"search":{"value":"","regex":false},"startDateHeader":"","endDateHeader":"","buyer":"","seller":"","attributesQuery":{},"marketplace":""}'''
r = requests.post(url, data, headers=headers)
# r = requests.get(url)
print(r)
print(r.text)


# Save the raw response so htmltesting.py can parse it offline.
filename = 'testing.html'
with open(filename, 'w') as f:
    f.write(r.text)
|
||||
Reference in New Issue
Block a user