Summa_ipsum
Fact: regular lorem ipsum is nonsense Latin. This really gets my goat. (No, I don't know how to say that in Latin, but I wish I did.)
[UPDATED code is here](https://www.cresante.com/catholic/summa/summa_ipsum.py); download and run it.
It's written in perfect style, don't ask questions. Hack as necessary for maximum functionality.
So I set out to make a lorem ipsum generator that produces proper Latin, using the text of the Summa Theologiae. But first I needed to get some proper source text.
Not being able to find a good source to download St. Thomas Aquinas' complete Summa in text form, I settled on scraping Wikisource, which has the full text.
However, manually scraping MediaWiki pages is considered bad form; there is a much better alternative: use their [provided API](https://www.mediawiki.org/wiki/API:Main_page).
Luckily, there is already a project for this: https://pypi.org/project/wikipedia/. After a small edit to its source files to change the API_URL variable (the library is intended for a very narrow scope, Wikipedia only, but that's easily fixed), I wrote a small script to do the rest. Note that it is necessary to run
mkdir Summa\ Theologiae/{Prima,Tertia,Secunda\ secundae,Prima\ secundae}\ pars
to create the directory structure
Here's the python script:
#!/bin/env python
from random import choice
from os import chdir
from pathlib import Path
def init_download():
    """Download the Summa Theologiae pages found by a search into local files.

    Searches for "Summa Theologiae" (up to 1000 titles) through the
    ``wikipedia`` package.  For every returned title that contains
    "Summa Theologiae" together with "Quaestio" or "Prooemium", the page HTML
    is fetched, converted to plain text with ``html2text`` (links and images
    ignored) and written to a file whose path is the page title, relative to
    the current working directory.  Each title is printed once it is saved.

    Missing parent directories are created on demand, and files are written
    as UTF-8 so the non-ASCII Latin text does not depend on the platform's
    default encoding.
    """
    # Imported here rather than at module level, so only this function needs
    # these third-party packages.
    import wikipedia
    import html2text

    titles = wikipedia.search("Summa Theologiae", results=1000)

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True

    def print_write(page):
        """Fetch *page*, save its text under the page title, print the title."""
        text = converter.handle(wikipedia.page(page).html())
        target = Path(page)
        # Titles containing "/" map onto nested directories.
        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(target, "w", encoding="utf-8") as fd:
                fd.write(text)
        except IsADirectoryError:
            # The title collides with an existing directory; skip it silently,
            # as before.
            return
        print(page)

    for page in titles:
        if "Summa Theologiae" not in page:
            continue
        if "Quaestio" in page or "Prooemium" in page:
            print_write(page)
        # NOTE: titles without a "/" were once downloaded as well, but that
        # pulled in a few extra, non-Summa pages, so they are skipped.
class SummaIpsum:
    """Placeholder class: it holds no state and defines no behaviour."""

    def __init__(self) -> None:
        """Create an instance; nothing is initialised."""
def summa_ipsum(MINLEN=300, MAXLEN=600, NUMBER=5):
    """Return ``NUMBER`` randomly chosen lines from ``summa.txt``.

    If ``summa.txt`` is missing but ``summa.txt.bz2`` exists in the current
    working directory, the archive is decompressed to ``summa.txt`` first
    (and ``decompressing`` is printed).  Both files are treated as UTF-8.

    Args:
        MINLEN: Only lines strictly longer than this many characters qualify.
        MAXLEN: Only lines strictly shorter than this many characters qualify.
        NUMBER: How many lines to draw.  Lines are drawn independently, so
            the same line may appear more than once.

    Returns:
        A list of ``NUMBER`` lines (empty when ``NUMBER`` is not positive).

    Raises:
        FileNotFoundError: If neither ``summa.txt`` nor the archive exists.
        IndexError: If lines are requested but none has a qualifying length.
    """
    # TODO?: need logic to ensure MINLEN and MAXLEN aren't too near each other
    plain = Path("summa.txt")
    packed = Path("summa.txt.bz2")

    if packed.exists() and not plain.exists():
        import bz2

        print('decompressing')
        with bz2.open(packed, "rt", encoding="utf-8") as src:
            text = src.read()
        with open(plain, "w", encoding="utf-8") as fout:
            fout.write(text)

    with open(plain, "rt", encoding="utf-8") as f:
        candidates = [
            line
            for line in f.read().split("\n")
            if line and MINLEN < len(line) < MAXLEN
        ]

    if NUMBER > 0 and not candidates:
        # random.choice would raise a bare IndexError here; keep the exception
        # type but say what actually went wrong.
        raise IndexError(
            f"summa.txt has no line longer than {MINLEN} "
            f"and shorter than {MAXLEN} characters"
        )

    return [choice(candidates) for _ in range(NUMBER)]
def makelines():
    """Build ``summa.txt`` from the page texts stored in ``latin/``.

    Every regular file directly inside ``latin/`` (relative to the current
    working directory) is read as UTF-8 and split into paragraphs on blank
    lines; newlines inside a paragraph become spaces.  A paragraph is kept
    when it has more than 12 space-separated words, fewer than 20% upper-case
    characters, fewer than 4000 characters, does not start with a space and
    does not start with "Quaestio" or "Prooemiu".

    Kept paragraphs are cleaned: leading whitespace and leading section
    markers are removed, and a lower-case first letter is upper-cased while
    the rest of the text is left untouched.  Each cleaned line is written to
    ``summa.txt`` in the current working directory, one per line.

    Files are processed in sorted order so the output is reproducible, and
    the working directory is never changed.

    Returns:
        The list of cleaned lines, in the order they were written.

    Raises:
        FileNotFoundError: If the ``latin`` directory does not exist.
    """
    import re

    # Section markers at the start of a line; each is stripped up to (not
    # including) the first ASCII capital that follows.  They are tried in
    # this order, each against the result of the previous one.
    marker_patterns = (
        ("Iª", re.compile(r"^Iª.*?(?=[A-Z])")),
        ("[", re.compile(r"^\[.*?(?=[A-Z])")),
        ("IIae", re.compile(r"^IIae .*?(?=[A-Z])")),
    )
    leading_junk = re.compile(r"^.*?(?=[A-Z])")

    def capital_pct(line):
        """Return the fraction of characters in *line* that are upper-case."""
        caps = sum(1 for c in line if c.isupper())
        return caps / len(line)

    def wordcount(line):
        """Return the number of space-separated chunks in *line*."""
        return len(line.split(" "))

    def keep(line):
        """Tell whether *line* looks like body text worth keeping."""
        # wordcount() comes first: it guarantees a non-empty line, so the
        # division in capital_pct() and the line[0] lookup are safe.
        return (
            wordcount(line) > 12
            and capital_pct(line) < 0.2
            and line[0] != " "
            and len(line) < 4000
            and not line.startswith(("Quaestio", "Prooemiu"))
        )

    def clean(line):
        """Strip section preamble/numbering and fix the first letter's case."""
        line = line.lstrip()
        if not line:
            return line
        for marker, pattern in marker_patterns:
            if line.startswith(marker):
                line = pattern.sub("", line)
        if line[0].islower():
            # str.capitalize() would also lower-case every other character
            # and so destroy proper nouns; only touch the first letter.
            line = line[0].upper() + line[1:]
        if not line[0].isalpha():
            line = leading_junk.sub("", line)
        return line

    # Listing the inputs first means a missing directory fails before
    # summa.txt is truncated.
    pages = sorted(p for p in Path("latin").iterdir() if p.is_file())

    checklines = []
    with open("summa.txt", "w", encoding="utf-8") as fout:
        for page in pages:
            paragraphs = page.read_text(encoding="utf-8").split("\n\n")
            for paragraph in paragraphs:
                line = paragraph.replace("\n", " ")
                if not keep(line):
                    continue
                line = clean(line)
                if not line:
                    # Nothing but whitespace was left.
                    continue
                fout.write(line + "\n")
                checklines.append(line)
    return checklines
def line_length_chart():
    """Show a 500-bin histogram of the line lengths found in ``summa.txt``.

    The file is read as UTF-8 from the current working directory.  Lengths
    are counted in characters, excluding the line terminator; the file's
    final newline does not contribute a spurious zero-length entry.
    """
    # Imported here rather than at module level, so only this function needs
    # matplotlib.
    import matplotlib.pyplot as plt

    with open("summa.txt", "r", encoding="utf-8") as fd:
        lengths = [len(line.rstrip("\n")) for line in fd]

    plt.hist(lengths, bins=500)
    plt.show()
if __name__ == "__main__":
    # Print a default selection of lines, each followed by a blank line.
    for paragraph in summa_ipsum():
        print(paragraph)
        print("")