I would like to download all images from the albumartporn community.
People tend to hate on AI but this is what it was made for.
all images part1 https://files.catbox.moe/1o0cgg.zip
all images part2 https://files.catbox.moe/t3pk4k.zip
AI-generated slop script:
import os
import requests
from urllib.parse import urlparse
from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Config
COMMUNITY = "albumartporn"
LEMMA_INSTANCE = "https://lemmy.world"  # You can change this to a different instance (no trailing slash)
DEST_FOLDER = "albumartporn_images"
MAX_PAGES = 100  # Increased to download more images
SORT = "TopAll"  # Changed to get the best quality images first

os.makedirs(DEST_FOLDER, exist_ok=True)

def get_posts(page):
    """Fetch one page of posts from the community via the Lemmy API."""
    url = f"{LEMMA_INSTANCE}/api/v3/post/list"
    params = {
        "community_name": COMMUNITY,
        "sort": SORT,
        "page": page
    }
    try:
        logger.debug(f"Fetching posts from page {page}")
        resp = requests.get(url, params=params)
        resp.raise_for_status()
        posts = resp.json().get("posts", [])
        logger.debug(f"Found {len(posts)} posts on page {page}")
        return posts
    except Exception as e:
        logger.error(f"Error fetching posts from page {page}: {e}")
        return []

def download_image(url, filename):
    """Stream a single image to disk."""
    try:
        logger.debug(f"Downloading image from {url}")
        resp = requests.get(url, stream=True, timeout=10)
        resp.raise_for_status()
        file_size = int(resp.headers.get('content-length', 0))
        logger.debug(f"Image size: {file_size} bytes")
        with open(filename, "wb") as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
        logger.debug(f"Successfully downloaded {filename}")
    except Exception as e:
        logger.error(f"Failed to download {url}: {e}")

def is_image_url(url):
    is_img = url.lower().endswith((".jpg", ".jpeg", ".png", ".gif", ".webp"))
    logger.debug(f"URL {url} is image: {is_img}")
    return is_img

def main():
    logger.info(f"Starting download from {COMMUNITY} community")
    logger.info(f"Sorting by: {SORT}")
    logger.info(f"Maximum pages to process: {MAX_PAGES}")
    image_count = 0
    for page in range(1, MAX_PAGES + 1):
        logger.info(f"Processing page {page}/{MAX_PAGES}")
        posts = get_posts(page)
        if not posts:
            logger.warning(f"No more posts on page {page}.")
            break
        for post in tqdm(posts, desc=f"Page {page}"):
            post_data = post.get("post", {})
            url = post_data.get("url")
            if not url:
                logger.debug("Post has no URL, skipping")
                continue
            if not is_image_url(url):
                logger.debug(f"URL is not an image: {url}")
                continue
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
            filepath = os.path.join(DEST_FOLDER, filename)
            if os.path.exists(filepath):
                logger.debug(f"File already exists: {filepath}")
                continue
            download_image(url, filepath)
            image_count += 1
    logger.info(f"✅ Download complete. Downloaded {image_count} images.")

if __name__ == "__main__":
    main()
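To run it you need Python 3 with the requests and tqdm packages installed (pip install requests tqdm). It saves everything into an albumartporn_images folder next to the script and skips files it has already downloaded, so it can be re-run safely.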
There isn’t a straightforward way to do it as far as I can see - most likely because instances usually don’t want tonnes of requests for tonnes of data.
If you have some programming knowledge, it would be feasible to write a script that either uses the Lemmy API to fetch the posts or scrapes the web pages directly.
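For the API route, here is a minimal sketch of pulling one page of posts through Lemmy's public /api/v3/post/list endpoint, which is what the script above builds on. The instance (lemmy.world), sort order, and page size are assumptions; adjust them for whichever instance you target.

import requests

# Fetch one page of posts for a community from a Lemmy instance.
resp = requests.get(
    "https://lemmy.world/api/v3/post/list",
    params={"community_name": "albumartporn", "sort": "TopAll", "page": 1, "limit": 50},
    timeout=10,
)
resp.raise_for_status()

# Each entry wraps the post; the attached link (often the image) is under "url".
for item in resp.json().get("posts", []):
    url = item.get("post", {}).get("url")
    if url:
        print(url)

From there it is just a matter of looping over pages and downloading whichever URLs point at image files, as the longer script does.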