# imgin/imgin/get.py

import sys
from os import remove, write
from threading import Thread
from time import sleep

import requests
import bs4

from .useragents import get_random_user_agent
from .config import IMAGE_CACHE, SINGLE_IMAGE_DELETE_AFTER_SECS

def delete_file(path):
    """Sleep for the configured delay, then remove the cached file if it still exists."""
    sleep(SINGLE_IMAGE_DELETE_AFTER_SECS)
    print('Erasing', path)
    try:
        remove(path)
    except FileNotFoundError:
        pass


def error(msg):
    """Write a message to stderr immediately."""
    sys.stderr.write(msg + "\n")
    sys.stderr.flush()


def get(url: str, write_dir: str, delete=True):
    """Download an imgur image or album into write_dir.

    Returns None for a single image, or (title, metas) for an album/gallery.
    """
    ua = get_random_user_agent()
    orig_url = url
    # Accept bare ids as well as full imgur URLs.
    if not url.startswith('https://imgur.com/'):
        url = 'https://imgur.com/' + url
    album = False
    if url.startswith("https://imgur.com/a/"):
        album = True
        # The "blog" layout exposes every image of the album on one page.
        if not url.endswith("blog"):
            url += "/layout/blog"
    if not album:
        print('Getting img', url)
        # Fetch the image directly from i.imgur.com and cache it under its id.
        url = 'https://i.imgur.com/' + url.rsplit('/', 1)[-1]
        with open(f'{write_dir}/{url[-11:]}', 'wb') as img:
            img.write(requests.get(url, headers={'User-Agent': ua}).content)
        if delete:
            # Schedule removal of the cached file in the background.
            Thread(target=delete_file, args=[f"{write_dir}/{url[-11:]}"]).start()
        return None
    else:
        found_url = ''
        found_urls = []
        found_list_file = ''
        title = ''
        metas = []
        print('Detecting album/gallery images (contentUrl)', url)
        soup = bs4.BeautifulSoup(requests.get(url, headers={'User-Agent': ua}).text, 'html.parser')
        # Album title from the OpenGraph metadata; imgur uses "Imgur" as a placeholder.
        try:
            title = soup.select('meta[property="og:title"]')[0]['content']
            if title == "Imgur":
                title = ''
        except (KeyError, IndexError):
            title = ''
        for count, el in enumerate(soup.select('.post-image-container'), start=1):
            if el is None:
                continue
            minisoup = bs4.BeautifulSoup(str(el), 'html.parser')
            try:
                # Preferred method: the contentUrl meta tag inside each post image.
                found_url = "https:" + minisoup.select('.post-image meta[itemprop="contentUrl"]')[0]['content']
                if '?1' in found_url:
                    continue
            except (KeyError, IndexError):
                error("Could not obtain url for detected image (contentUrl), trying id method")
                try:
                    # Fallback: build the URL from the container id; .jpg is equivalent to .png here.
                    found_url = "https://i.imgur.com/" + el['id'] + ".jpg"
                except KeyError:
                    error("Could not obtain url for detected image (id)")
                    continue
            if found_url.endswith('ico.jpg'):
                continue
            found_urls.append(found_url[-11:])
            print(f"Downloading image {count}: {found_url}")
            print("Writing image", f"{write_dir}{found_url[-11:]}")
            with open(f"{write_dir}{found_url[-11:]}", "wb") as f:
                f.write(requests.get(found_url, headers={'User-Agent': ua}).content)
            if delete:
                Thread(target=delete_file, args=[f"{write_dir}{found_url[-11:]}"]).start()
            # Per-image title and description, if present.
            subtitle = ''
            try:
                subtitle = minisoup.select('.post-image-title')[0].string
            except IndexError:
                subtitle = ''
            desc = ''
            try:
                desc = minisoup.select('.post-image-description')[0].string
            except IndexError:
                desc = ''
            metas.append((subtitle, desc))
        # Write the found urls to a file named after the album so the viewer endpoint can get them
        found_list_file = write_dir + orig_url.replace('/', '_')
        with open(found_list_file, 'w') as f:
            f.write(','.join(found_urls))
        Thread(target=delete_file, args=[found_list_file]).start()
        return title, metas
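

# Usage sketch (not part of the original module): how get() might be called by a
# consumer such as the viewer endpoint. The album id and cache directory below are
# hypothetical placeholders, not values taken from this project.
#
#   from imgin.get import get
#
#   # Single image: cached under its id, then deleted after the configured delay.
#   get('https://imgur.com/abc1234x', '/tmp/imgin-cache', delete=True)
#
#   # Album/gallery: also writes a comma-separated id list for the viewer and
#   # returns (title, [(subtitle, description), ...]).
#   title, metas = get('https://imgur.com/a/abc1234', '/tmp/imgin-cache/', delete=False)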