Find imgur image and album ids
This commit is contained in:
parent
81a5375421
commit
006e7d4e38
1 changed file with 72 additions and 0 deletions
72
archive_imgur_images/find_images.py
Executable file
72
archive_imgur_images/find_images.py
Executable file
|
|
@ -0,0 +1,72 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Help text shown at the top of `--help` output.
DESCRIPTION = """
Find imgur images and albums linked in message contents and store them in a json
file.
"""

# Usage notes shown at the bottom of `--help` output.
EPILOG = """
This program expects the output of `cove export -o - -f json-stream <room>` on
stdin. To run the script on all messages in your vault, pass `-a` to cove
instead of specifying one or more rooms directly.
"""

# Matches an image link and captures the id that directly follows the host.
# The `(?!a/)` lookahead keeps album links (`imgur.com/a/<id>`) from being
# recorded as an image whose id is the literal path segment "a".
IMAGE_RE = re.compile(r"imgur\.com/(?!a/)([a-zA-Z0-9]+)")

# Matches an album link (`imgur.com/a/<id>`) and captures the album id.
ALBUM_RE = re.compile(r"imgur\.com/a/([a-zA-Z0-9]+)")
|
||||
|
||||
|
||||
def main():
    """Collect imgur image and album ids from messages on stdin and save them.

    Reads one JSON object per line from stdin, searches each object's
    ``"content"`` string with ``IMAGE_RE`` and ``ALBUM_RE``, and writes the
    sorted unique ids to the JSON file given by ``--info``/``-i`` (default
    ``imgur_images.json``). A progress line is printed to stdout after every
    100,000 messages, followed by final totals.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a message object has no ``"content"`` key.
    """
    parser = argparse.ArgumentParser(
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--info",
        "-i",
        type=Path,
        default=Path("imgur_images.json"),
        help="the json file to save the links in",
    )
    args = parser.parse_args()

    image_ids = set()
    album_ids = set()

    sifted = 0
    for line in sys.stdin:
        # A blank line (e.g. a trailing newline) is not a message; skip it
        # rather than letting json.loads abort the whole run.
        if not line.strip():
            continue

        content = json.loads(line)["content"]
        image_ids.update(match.group(1) for match in IMAGE_RE.finditer(content))
        album_ids.update(match.group(1) for match in ALBUM_RE.finditer(content))

        sifted += 1
        if sifted % 100_000 == 0:
            print(
                f"Sifted through {sifted:_} messages, "
                f"found {len(image_ids):_} image ids, "
                f"{len(album_ids):_} album ids so far"
            )

    print(f"Sifted through {sifted:_} messages in total")
    print(f"Found {len(image_ids):_} unique image ids")
    print(f"Found {len(album_ids):_} unique album ids")

    print("Saving image and album ids")
    # sorted() already returns a new list; sorting keeps the output file
    # stable between runs over the same input.
    data = {
        "image_ids": sorted(image_ids),
        "album_ids": sorted(album_ids),
    }
    with open(args.info, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue