Initial commit
This commit is contained in:
118
scraper_utilities/program.py
Normal file
118
scraper_utilities/program.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import requests
|
||||
from scrapy import Selector
|
||||
import json
|
||||
|
||||
|
||||
# Directory listing page; it is both fetched directly and sent as the referer.
url = "https://www.causeiq.com/directory/business-and-community-development-organizations-list/"

# Browser-like request headers sent with every call to the directory page.
headers = {
    "content-type": "application/json",
    "accept": "application/json",
    "referer": url,
    "origin": "https://www.causeiq.com",
    "sec-fetch-site": "same-origin",
    "user-agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    ),
}

# Base JSON body for directory queries; callers fill in "filters".
payload = {
    "filters": [],
    "pageNumber": 1,
    "sortDir": "desc",
    "sortHow": "popularity",
}
|
||||
|
||||
# Prime the session: fetch the listing page once and keep its cookies, which
# are sent along with the POST queries made by the functions below.
r_init = requests.get(url, headers=headers)
# Fail here with a clear HTTP error instead of a confusing KeyError further down.
r_init.raise_for_status()
cookies = r_init.cookies
# Debug output; .get() prints None instead of raising KeyError when the
# response carries no Set-Cookie header.
print(r_init.headers.get('set-cookie'))
# Copy the CSRF cookie into the request headers.
# Raises KeyError if the server did not set a 'csrftoken' cookie.
headers['x-csrftoken'] = cookies['csrftoken']

# The metros endpoint returns a JSON object; its keys are the metro
# identifiers iterated by get_stations().
metros_response = requests.get('https://www.causeiq.com/directory/retrieve_variable/metros/')
metros_response.raise_for_status()
stations = json.loads(metros_response.text).keys()
|
||||
|
||||
|
||||
def get_stations():
    """Print the company count reported for every metro in ``stations``.

    For each metro key, POST the directory query with a single ``metro``
    filter, read ``count`` from the JSON response, and print that count
    together with a running total.

    Uses the module-level ``url``, ``headers``, ``cookies``, ``payload`` and
    ``stations`` values.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        KeyError: If the decoded response has no ``count`` field.
    """
    print('Total stations: ', len(stations))

    total_companies = 0
    for metro in stations:
        # Build a per-request body rather than writing into the shared
        # ``payload`` dict, so the last metro filter cannot leak into
        # requests made elsewhere. dict() keeps the existing key order.
        body = dict(payload, filters=[{
            "hash": "1",
            "type": "metro",
            "value": str(metro),
        }])
        response = requests.post(
            url,
            data=json.dumps(body),
            headers=headers,
            cookies=cookies,
        )
        response.raise_for_status()

        count = json.loads(response.text)['count']
        total_companies += count
        # Emit the progress line (previously formatted but never printed).
        print('Got metro %s with %s companies. Current total: %s' % (metro, count, total_companies))
|
||||
|
||||
|
||||
def get_full_station(station):
    """Print company counts for one metro, overall and per keyword letter.

    First queries the directory with only the ``metro`` filter and prints the
    reported ``count``. Then repeats the query once per letter ``a``-``z``
    with an additional ``keywords`` filter, printing each count and the
    running total across letters.

    Uses the module-level ``url``, ``headers``, ``cookies`` and ``payload``
    values.

    Args:
        station: Value sent as the ``metro`` filter.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        KeyError: If a decoded response has no ``count`` field.
    """
    metro_filter = {
        "hash": "1",
        "type": "metro",
        "value": station,
    }

    def fetch_count(filters):
        # One POST per filter set. The body is a copy of the shared
        # ``payload`` so the module-level dict is left untouched.
        body = dict(payload, filters=filters)
        response = requests.post(
            url,
            data=json.dumps(body),
            headers=headers,
            cookies=cookies,
        )
        response.raise_for_status()
        return json.loads(response.text)['count']

    print(fetch_count([metro_filter]))

    total = 0
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        count = fetch_count([
            metro_filter,
            {
                "hash": "2",
                "type": "keywords",
                "value": letter,
            },
        ])
        total += count
        print('Got %s companies from letter %s. Total: %s' % (count, letter, total))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Disabled experiment: request the second page of results.
# print("one")

# print('two')
# payload['pageNumber'] = 2
# r = requests.post(
#     'https://www.causeiq.com/directory/business-and-community-development-organizations-list/',
#     json.dumps(payload),
#     headers=headers,
#     cookies=cookies
# )
print("end")

# Disabled experiment: dump the last response body to disk.
# filename = 'responses.json'
# with open(filename, 'w') as f:
#     f.write(r.text)
|
||||
Reference in New Issue
Block a user