Files
useful/scraper_utilities/program.py
2024-07-13 00:53:45 +03:00

119 lines
2.5 KiB
Python

import requests
from scrapy import Selector
import json
# Headers sent with the causeiq.com directory requests. The CSRF token is
# added further down, once the initial page load has supplied the cookie.
headers = {
    "content-type": "application/json",
    "accept": "application/json",
    "referer": "https://www.causeiq.com/directory/business-and-community-development-organizations-list/",
    "origin": "https://www.causeiq.com",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}

# JSON body template for directory searches; the functions below overwrite
# the "filters" entry before each POST.
payload = {
    "filters": [],
    "pageNumber": 1,
    "sortDir": "desc",
    "sortHow": "popularity",
}

url = 'https://www.causeiq.com/directory/business-and-community-development-organizations-list/'

# Load the directory page once to obtain the session cookies. A timeout keeps
# the script from hanging forever on an unresponsive server, and the status
# check turns a blocked/failed request into a clear HTTP error instead of a
# KeyError on the missing cookie below.
r_init = requests.get(url, headers=headers, timeout=30)
r_init.raise_for_status()
cookies = r_init.cookies
print(r_init.headers['set-cookie'])
headers['x-csrftoken'] = cookies['csrftoken']

# Fetch the metro lookup table. Only its keys are kept; get_stations() sends
# each key as the value of a "metro" filter.
stations_response = requests.get(
    'https://www.causeiq.com/directory/retrieve_variable/metros/',
    timeout=30,
)
stations_response.raise_for_status()
stations = json.loads(stations_response.text).keys()
def get_stations(timeout=30):
    """Print the company count for every metro in ``stations``.

    Posts one directory search per metro key, reads the ``count`` field of
    each JSON response and keeps a running total across all metros.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.HTTPError: If a search request returns an error status.

    Note:
        Each iteration overwrites ``payload['filters']`` on the shared
        module-level ``payload``.
    """
    print('Total stations: ', len(stations))
    total_companies = 0
    for metro in stations:
        payload['filters'] = [{
            "hash": "1",
            "type": "metro",
            "value": str(metro),
        }]
        r = requests.post(
            url,
            json.dumps(payload),
            headers=headers,
            cookies=cookies,
            timeout=timeout,
        )
        # Fail with a clear HTTP error rather than a JSON/KeyError further down.
        r.raise_for_status()
        decoded = json.loads(r.text)
        total_companies += decoded['count']
        # This progress line used to be built and then discarded; print it so
        # the per-metro result is actually reported.
        print('Got metro %s with %s companies. Current total: %s' % (metro, decoded['count'], total_companies))
def _count_companies(filters, timeout=30):
    """POST a directory search using *filters* and return the response's ``count``."""
    # The shared module-level payload is updated in place, so it always holds
    # the filters of the most recent search.
    payload['filters'] = filters
    r = requests.post(
        url,
        json.dumps(payload),
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    # Fail with a clear HTTP error rather than a JSON/KeyError further down.
    r.raise_for_status()
    return json.loads(r.text)['count']


def get_full_station(station, timeout=30):
    """Print company counts for one metro, overall and per keyword letter.

    First prints the ``count`` returned for the metro filter alone, then
    repeats the search once per lowercase ASCII letter used as a ``keywords``
    filter, printing each letter's count and the running sum.

    Args:
        station: Metro identifier sent as the value of the "metro" filter.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.HTTPError: If a search request returns an error status.
    """
    metro_filter = {
        "hash": "1",
        "type": "metro",
        "value": station,
    }
    print(_count_companies([metro_filter], timeout))

    total = 0
    # NOTE(review): presumably a company can match several letters, so the
    # running total may exceed the metro-only count printed above - confirm.
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        keyword_filter = {
            "hash": "2",
            "type": "keywords",
            "value": letter,
        }
        count = _count_companies([metro_filter, keyword_filter], timeout)
        total += count
        print('Got %s companies from letter %s. Total: %s' % (count, letter, total))
# Disabled code: two debug prints and a request for page 2 of the directory
# listing using the shared payload.
# print("one")
# print('two')
# payload['pageNumber'] = 2
# r = requests.post(
# 'https://www.causeiq.com/directory/business-and-community-development-organizations-list/',
# json.dumps(payload),
# headers=headers,
# cookies=cookies
# )
# Marker printed once execution reaches this point.
print("end")
# Disabled code: write the raw response body of `r` to responses.json.
# filename = 'responses.json'
# with open(filename, 'w') as f:
# f.write(r.text)