From 006e7d4e381ebf7924cbe9074abaa6ffd060f3d6 Mon Sep 17 00:00:00 2001 From: Joscha Date: Wed, 26 Apr 2023 20:40:16 +0200 Subject: [PATCH] Find imgur image and album ids --- archive_imgur_images/find_images.py | 72 +++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100755 archive_imgur_images/find_images.py diff --git a/archive_imgur_images/find_images.py b/archive_imgur_images/find_images.py new file mode 100755 index 0000000..cd4921f --- /dev/null +++ b/archive_imgur_images/find_images.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 + +import argparse +import json +import re +import sys +from pathlib import Path + +DESCRIPTION = """ +Find imgur images and albums linked in message contents and store them in a json +file. +""" + +EPILOG = """ +This program expects the output of `cove export -o - -f json-stream ` on +stdin. To run the script on all messages in your vault, pass `-a` to cove +instead of specifying one or more rooms directly. +""" + +IMAGE_RE = re.compile(r"imgur.com/([a-zA-Z0-9]+)") +ALBUM_RE = re.compile(r"imgur.com/a/([a-zA-Z0-9]+)") + + +def main(): + parser = argparse.ArgumentParser( + description=DESCRIPTION, + epilog=EPILOG, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--info", + "-i", + type=Path, + default=Path("imgur_images.json"), + help="the json file to save the links in", + ) + args = parser.parse_args() + + image_ids = set() + album_ids = set() + + sifted = 0 + for line in sys.stdin: + msg = json.loads(line) + for match in IMAGE_RE.finditer(msg["content"]): + image_ids.add(match.group(1)) + for match in ALBUM_RE.finditer(msg["content"]): + album_ids.add(match.group(1)) + + sifted += 1 + if sifted % 100_000 == 0: + print( + f"Sifted through {sifted:_} messages, " + f"found {len(image_ids):_} image ids, " + f"{len(album_ids):_} album ids so far" + ) + + print(f"Sifted through {sifted:_} messages in total") + print(f"Found {len(image_ids):_} unique image ids") + print(f"Found {len(album_ids):_} unique album ids") + + print("Saving image and album ids") + data = { + "image_ids": list(sorted(image_ids)), + "album_ids": list(sorted(album_ids)), + } + with open(args.info, "w") as f: + json.dump(data, f, indent=2) + + +if __name__ == "__main__": + main()