# imgin/imgin/get.py

import sys
from os import remove, write
from threading import Thread
from time import sleep

import requests
import bs4

from .useragents import get_random_user_agent
from .config import IMAGE_CACHE, SINGLE_IMAGE_DELETE_AFTER_SECS

def delete_file(path):
    """Sleep for the configured delay, then remove the cached file if it still exists."""
    sleep(SINGLE_IMAGE_DELETE_AFTER_SECS)
    print('Erasing', path)
    try:
        remove(path)
    except FileNotFoundError:
        pass


def error(msg):
    """Write a message to stderr immediately."""
    sys.stderr.write(msg + "\n")
    sys.stderr.flush()


def get(url: str, write_dir: str, delete=True):
    """Download an imgur image or album into write_dir.

    Returns None for a single image, or (title, metas) for an album/gallery.
    """
    ua = get_random_user_agent()
    orig_url = url
    # Accept bare ids as well as full imgur URLs.
    if not url.startswith('https://imgur.com/'):
        url = 'https://imgur.com/' + url
    album = False
    if url.startswith("https://imgur.com/a/"):
        album = True
        # The "blog" layout exposes every image of the album on one page.
        if not url.endswith("blog"):
            url += "/layout/blog"
    if not album:
        print('Getting img', url)
        # Fetch the image directly from i.imgur.com and cache it under its id.
        url = 'https://i.imgur.com/' + url.rsplit('/', 1)[-1]
        with open(f'{write_dir}/{url[-11:]}', 'wb') as img:
            img.write(requests.get(url, headers={'User-Agent': ua}).content)
        if delete:
            # Schedule removal of the cached file in the background.
            Thread(target=delete_file, args=[f"{write_dir}/{url[-11:]}"]).start()
        return None
    else:
        found_url = ''
        found_urls = []
        found_list_file = ''
        title = ''
        metas = []
        print('Detecting album/gallery images (contentUrl)', url)
        soup = bs4.BeautifulSoup(requests.get(url, headers={'User-Agent': ua}).text, 'html.parser')
        # Album title from the OpenGraph metadata; imgur uses "Imgur" as a placeholder.
        try:
            title = soup.select('meta[property="og:title"]')[0]['content']
            if title == "Imgur":
                title = ''
        except (KeyError, IndexError):
            title = ''
        for count, el in enumerate(soup.select('.post-image-container'), start=1):
            if el is None:
                continue
            minisoup = bs4.BeautifulSoup(str(el), 'html.parser')
            try:
                # Preferred method: the contentUrl meta tag inside each post image.
                found_url = "https:" + minisoup.select('.post-image meta[itemprop="contentUrl"]')[0]['content']
                if '?1' in found_url:
                    continue
            except (KeyError, IndexError):
                error("Could not obtain url for detected image (contentUrl), trying id method")
                try:
                    # Fallback: build the URL from the container id; .jpg is equivalent to .png here.
                    found_url = "https://i.imgur.com/" + el['id'] + ".jpg"
                except KeyError:
                    error("Could not obtain url for detected image (id)")
                    continue
            if found_url.endswith('ico.jpg'):
                continue
            found_urls.append(found_url[-11:])
            print(f"Downloading image {count}: {found_url}")
            print("Writing image", f"{write_dir}{found_url[-11:]}")
            with open(f"{write_dir}{found_url[-11:]}", "wb") as f:
                f.write(requests.get(found_url, headers={'User-Agent': ua}).content)
            if delete:
                Thread(target=delete_file, args=[f"{write_dir}{found_url[-11:]}"]).start()
            # Per-image title and description, if present.
            subtitle = ''
            try:
                subtitle = minisoup.select('.post-image-title')[0].string
            except IndexError:
                subtitle = ''
            desc = ''
            try:
                desc = minisoup.select('.post-image-description')[0].string
            except IndexError:
                desc = ''
            metas.append((subtitle, desc))
        # Write the found urls to a file named after the album so the viewer endpoint can get them
        found_list_file = write_dir + orig_url.replace('/', '_')
        with open(found_list_file, 'w') as f:
            f.write(','.join(found_urls))
        Thread(target=delete_file, args=[found_list_file]).start()
        return title, metas
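

# Usage sketch (not part of the original module): how get() might be called by a
# consumer such as the viewer endpoint. The album id and cache directory below are
# hypothetical placeholders, not values taken from this project.
#
#   from imgin.get import get
#
#   # Single image: cached under its id, then deleted after the configured delay.
#   get('https://imgur.com/abc1234x', '/tmp/imgin-cache', delete=True)
#
#   # Album/gallery: also writes a comma-separated id list for the viewer and
#   # returns (title, [(subtitle, description), ...]).
#   title, metas = get('https://imgur.com/a/abc1234', '/tmp/imgin-cache/', delete=False)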