Okay, it's working. I think I just need to save the (hash, URL) tuples (for both photobucket and imgur) and add code to automagically upload to imgur using its API. I also need to remove the PA_THREAD and TEST_URL stuff.
from bs4 import BeautifulSoup
import requests
import sys, os
import hashlib, base64
from io import BytesIO
from urllib.parse import urlparse
from os.path import splitext, basename
# User-Agent string sent with every request (identifies as desktop Chrome 59).
UA = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/59.0.3071.115 Safari/537.36"
)

# Sample Photobucket image URL.
TEST_URL = 'http://i984.photobucket.com/albums/ae321/isaacscr/Misc/HPIM5242.jpg'

# Request headers that make the scraper present itself as a Chrome browser.
HEADERS = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': UA,
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'DNT': '1',
}
def filename_of_url(u):
    """Return the local file name used to store the resource at URL *u*.

    The name is the URL-safe base64 encoding of the SHA3-224 digest of the
    full URL, followed by the extension of the last component of the URL
    path (e.g. ``.jpg``).  The query string is not part of the path, so it
    never contributes to the extension.  A URL whose path has no extension
    yields just the hash.

    Args:
        u: The URL as a ``str``.

    Returns:
        The derived file name as a ``str``.
    """
    img_url_path = basename(urlparse(u).path)
    # splitext() keeps the leading dot (".jpg"), so the extension is appended
    # as-is; adding another "." would produce names like "hash..jpg".
    img_ext = os.path.splitext(img_url_path)[1]
    url_hash = base64.urlsafe_b64encode(
        hashlib.sha3_224(u.encode('utf-8')).digest()
    ).decode("ascii")
    return url_hash + img_ext
def fetch_pb_image(url, timeout=30):
    """Download the Photobucket image at *url* into the working directory.

    The URL is requested twice through one session: the URL the first
    response ended up at is sent as the ``referer`` header of the second
    request, whose body is written to disk (presumably to get past
    Photobucket's hotlink check -- confirm).  The output file name is
    ``filename_of_url(url)``.

    This is best-effort: any failure is reported on stderr and swallowed so
    a batch run can continue with the next image.  Redirect stderr to a file
    to keep a log of failed images.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None
    """
    try:
        # The context manager closes the session's connections when done.
        with requests.Session() as s:
            s.headers.update(HEADERS)
            first = s.get(url, timeout=timeout)
            s.headers.update({'referer': first.url})
            img_req = s.get(url, timeout=timeout)
            # Without this, an HTTP error page would be saved as the image.
            img_req.raise_for_status()
            img_filename = filename_of_url(url)
            with open(img_filename, 'wb') as out:
                out.write(img_req.content)
    except Exception as err:
        # Anything could have gone wrong; report it and move on so the
        # next image still gets a chance.
        print(err.__class__, file=sys.stderr)
        print(err, file=sys.stderr)
        print("failed to get", url, file=sys.stderr)
# NOTE(review): everything in this run of statements executes at import time
# and looks like leftover debugging scaffolding. thread_page and
# current_pagenum are not read anywhere in the visible part of this file.

# This session is rebound further down before it is ever used.
pa_session = requests.Session()
# Forum thread fetched once below.
PA_THREAD = "http://www.primitivearcher.com/smf/index.php/topic,27206.0.html"
# Performs a network request (plain requests.get, no timeout) and parses the
# returned page every time the module is loaded.
thread_page = BeautifulSoup(requests.get(PA_THREAD).content, "html.parser")
# NOTE(review): the meaning of 420 is not evident from this file -- confirm.
current_pagenum = 420
# Overrides the TEST_URL constant defined near the top of the file.
TEST_URL = 'http://i1278.photobucket.com/albums/y506/psmith311/Mobile%20Uploads/2017-05/3896EDFF-DA57-41F2-9D8B-DDF95DED3F01_zps70czmzvh.jpg'
# Downloads that single test image into the working directory.
fetch_pb_image(TEST_URL)
def process_thread(url, session):
    """Download every Photobucket image in a forum thread, page by page.

    Starting at *url*, each page is fetched and parsed, every ``<img>`` whose
    ``src`` contains ``photobucket.com`` is passed to ``fetch_pb_image``, and
    the walk continues with the link in the ``.pagelinks`` element whose text
    is the current page number plus one.  The walk ends when no such link
    exists, or when the page has no ``.pagelinks`` element or no current
    page marker (``<strong>``).

    Args:
        url: URL of the first thread page to process.
        session: A ``requests.Session`` (or any object with a compatible
            ``get(url, timeout=...)`` method) used to fetch the pages.

    Returns:
        None
    """
    from urllib.parse import urljoin

    while url:
        response = session.get(url, timeout=30)
        thread_page = BeautifulSoup(response.content, "html.parser")

        for img in thread_page.find_all("img"):
            src = img.get("src")
            # <img> tags without a src attribute yield None; skip them
            # instead of failing on the substring test.
            if src and "photobucket.com" in src:
                fetch_pb_image(src)

        # Locate the link to the following page, if there is one.
        pagelinks = thread_page.select(".pagelinks")
        current_marker = pagelinks[0].find("strong") if pagelinks else None
        if current_marker is None:
            break
        thread_pageno = int(current_marker.text)
        next_page_link = pagelinks[0].find("a", text=str(thread_pageno + 1))
        href = next_page_link.get("href") if next_page_link else None
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the page they were found on.
        url = urljoin(url, href) if href else None
# Script entry: create the HTTP session used for fetching the thread pages.
# This rebinds the module-level pa_session created earlier in the file.
pa_session = requests.Session()
# Walk the thread from the given URL; images are written to the working
# directory by fetch_pb_image.
process_thread("http://www.primitivearcher.com/smf/index.php/topic,60633.0.html", pa_session)