Summa_ipsum

Fact: regular lorem ipsum is nonsense Latin. This really gets my goat. (No, I don't know how to say that in Latin, but I wish I did.)

[UPDATED code is here](https://www.cresante.com/catholic/summa/summa_ipsum.py), download and run it

It's written in perfect style, don't ask questions. Hack as necessary for maximum functionality.

So I set out to make a lorem ipsum generator that produces proper Latin, using the text of the Summa Theologiae. But first I needed to get some proper source text.

Not being able to find a good source to download St. Thomas Aquinas' complete Summa in text form, I settled on scraping Wikisource, which has the full text.

However, manually scraping MediaWiki pages is considered bad form; there is a much better alternative: use their [provided API](https://www.mediawiki.org/wiki/API:Main_page).

Luckily, there is already a project for this: https://pypi.org/project/wikipedia/. After a small edit to the library's source files to change the API_URL variable (the library is intended for a very narrow scope, Wikipedia only, but that's easily fixed), I wrote a small script to do the rest. Note that, before running the script, it is necessary to run

mkdir Summa\ Theologiae/{Prima,Tertia,Secunda\ secundae,Prima\ secundae}\ pars

to create the directory structure

Here's the python script:

python summa_ipsum.py


#!/bin/env python

from random import choice
from os  import chdir
from pathlib import Path


def init_download():
    """Download the Summa Theologiae pages and save each one as plain text.

    Searches for "Summa Theologiae" (up to 1000 results) through the
    ``wikipedia`` package, keeps the titles that contain "Summa Theologiae"
    and either "Quaestio" or "Prooemium", converts each page's HTML to text
    with ``html2text`` (links and images ignored) and writes the text to a
    file named after the page title, relative to the current directory.
    The title is printed after a successful write.
    """
    import wikipedia
    import html2text

    titles = wikipedia.search("Summa Theologiae", results=1000)

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True

    def print_write(title):
        """Fetch one page, store its text under its title, then echo the title."""
        text = converter.handle(wikipedia.page(title).html())
        try:
            with open(title, "w") as out:
                out.write(text)
            print(title)
        except IsADirectoryError:
            # The title names an existing directory; skip it silently.
            pass

    section_markers = ("Quaestio", "Prooemium")
    for title in titles:
        if "Summa Theologiae" not in title:
            continue
        if any(marker in title for marker in section_markers):
            print_write(title)
        # Disabled: additionally saving titles without '/' downloads a few
        # extra, non-Summa pages.


class SummaIpsum:
    """Placeholder class; it holds no state and defines no behavior yet."""

    def __init__(self):
        """Create an instance without setting any attributes."""


def summa_ipsum(MINLEN=300, MAXLEN=600, NUMBER=5):
    """Return NUMBER randomly chosen lines from ``summa.txt``.

    If ``summa.txt`` is missing but ``summa.txt.bz2`` exists in the current
    directory, the archive is first decompressed into ``summa.txt``.

    Args:
        MINLEN: exclusive lower bound on the length of a candidate line.
        MAXLEN: exclusive upper bound on the length of a candidate line.
        NUMBER: how many lines to draw (with replacement).

    Returns:
        A list of NUMBER lines, each picked independently with
        ``random.choice``.

    Raises:
        IndexError: from ``random.choice`` when NUMBER > 0 and no line falls
            inside the length window.
    """
    # TODO?: need logic to ensure MINLEN and MAXLEN aren't too near each other
    archive = Path('summa.txt.bz2')
    plain = Path('summa.txt')
    if archive.exists() and not plain.exists():
        import bz2
        print('decompressing')
        with bz2.open('summa.txt.bz2', 'rt') as packed:
            text = packed.read()
            with open('summa.txt', 'w') as unpacked:
                unpacked.write(text)

    with open("summa.txt", "rt") as source:
        # Keep only non-empty lines strictly inside the length window.
        candidates = [
            row
            for row in source.read().split("\n")
            if row and MINLEN < len(row) < MAXLEN
        ]

    return [choice(candidates) for _ in range(NUMBER)]


def makelines():
    """Build ``summa.txt`` from the text files in the ``latin`` directory.

    Every regular file directly inside ``latin`` (relative to the current
    directory) is read, split into paragraphs on blank lines, and each
    paragraph is flattened to one line by turning its newlines into spaces.
    A paragraph is kept only when it

    * has more than 12 space-separated words,
    * is less than 20% upper-case characters,
    * does not start with a space,
    * is shorter than 4000 characters, and
    * does not start with "Quaestio" or "Prooemiu".

    Kept paragraphs get their leading section reference / numbering removed
    and their first letter upper-cased, and are written one per line to
    ``summa.txt`` in the current directory.  Files are processed in sorted
    order so the output is reproducible.  The working directory is never
    changed, so a failure half-way cannot leave the process inside ``latin``.

    Returns:
        The cleaned paragraphs, in the order they were written.

    Raises:
        OSError: if ``latin`` cannot be listed or a file cannot be read/written.
    """
    import re

    # Compiled once instead of once per paragraph.  Each pattern deletes a
    # leading marker up to (not including) the first ASCII capital letter.
    part_prefix = re.compile(r"^Iª.*?(?=[A-Z])")
    bracket_prefix = re.compile(r"^\[.*?(?=[A-Z])")
    iiae_prefix = re.compile(r"^IIae .*?(?=[A-Z])")
    leading_junk = re.compile(r"^.*?(?=[A-Z])")

    def capital_pct(line):
        """Return the fraction of characters in *line* that are upper-case."""
        caps = sum(1 for c in line if c.isupper())
        return caps / len(line)

    def wordcount(line):
        """Return the number of space-separated tokens in *line*."""
        return len(line.split(" "))

    def is_body_text(line):
        """Tell whether *line* looks like running text rather than a heading."""
        return (
            # Checked first: more than 12 words implies a non-empty line, so
            # the division in capital_pct and the line[0] access are safe.
            wordcount(line) > 12
            and capital_pct(line) < 0.2
            and line[0] != " "
            and len(line) < 4000
            and not line.startswith(("Quaestio", "Prooemiu"))
        )

    def clean(line):
        """Strip the section preamble/numbering and capitalize the first letter."""
        line = line.lstrip()
        # remove pesky section preamble and numbering
        if line.startswith("Iª"):
            line = part_prefix.sub("", line)
        if line.startswith("["):
            line = bracket_prefix.sub("", line)
        if line.startswith("IIae"):
            line = iiae_prefix.sub("", line)
        if line[0].islower():
            # Only raise the first letter: str.capitalize() would also
            # lower-case the rest of the paragraph, ruining proper nouns
            # and the capitals that start later sentences.
            line = line[0].upper() + line[1:]
        if not line[0].isalpha():
            line = leading_junk.sub("", line)
        return line

    # List the inputs before touching the output, so a missing ``latin``
    # directory does not truncate an existing summa.txt.
    sources = sorted(p for p in Path("latin").iterdir() if p.is_file())

    checklines = []
    with open("summa.txt", "w") as fout:
        for source in sources:
            with open(source) as fd:
                paragraphs = fd.read().split("\n\n")
            for paragraph in paragraphs:
                line = paragraph.replace("\n", " ")
                if not is_body_text(line):
                    continue
                line = clean(line)
                fout.write(line + "\n")
                checklines.append(line)
    return checklines


def line_length_chart():
    """Show a 500-bin histogram of the line lengths in ``summa.txt``.

    Reads ``summa.txt`` from the current directory and plots the length of
    every non-empty line with matplotlib, then opens the plot window.
    Empty lines are ignored: the file ends with a newline, so a plain split
    would otherwise add a spurious zero-length sample, and the generator in
    this file likewise never selects empty lines.

    Raises:
        OSError: if ``summa.txt`` cannot be read.
        ImportError: if matplotlib is not installed.
    """
    with open("summa.txt", "r") as fd:
        lens = [len(line) for line in fd.read().split("\n") if line]

    # Imported lazily so the rest of the script works without matplotlib.
    import matplotlib.pyplot as plt

    plt.hist(lens, bins=500)
    plt.show()


if __name__ == "__main__":
    # Print each randomly selected paragraph followed by a blank line.
    for paragraph in summa_ipsum():
        print(paragraph)
        print()