# scotusFeed/feed.py
# Generate an RSS podcast feed of SCOTUS oral-argument audio for a given term.
import io
import re
import requests
import argparse
import logging
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator
from dateutil import parser
from datetime import timezone
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
def feedbase():
    """Build and return the base FeedGenerator for the term's podcast feed.

    Reads the module-level TERM, LINK, HOME, and LOGO globals, which are
    assigned from command-line arguments in the __main__ block.
    """
    fg = FeedGenerator()
    fg.load_extension('podcast')
    # BUG FIX: the original concatenated 'SCOTUS Audio ' + TERM + 'Term',
    # yielding e.g. "SCOTUS Audio 2017Term" with no space before "Term".
    fg.title('SCOTUS Audio ' + TERM + ' Term')
    fg.subtitle('This is an automated feed of the mp3 files from the SCOTUS website. NOT AFFILIATED WITH THE COURT OR THE JUSTICES.')
    fg.link(href=LINK, rel='self')
    fg.language('en')
    if HOME:
        # Optional landing page for the source of the audio.
        fg.link(href=HOME, rel='alternate')
    if LOGO:
        fg.logo(LOGO)
    return fg
def get_filesize(argument_id):
    """Return the Content-Length header (a string) of the argument's mp3.

    Uses a HEAD request so only headers are transferred, not the audio.
    """
    mp3_url = 'https://www.supremecourt.gov/media/audio/mp3files/' + argument_id + '.mp3'
    response = requests.head(mp3_url)
    return response.headers['content-length']
def add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number):
    """Append one oral-argument entry (with mp3 enclosure) to the feed."""
    entry = feed.add_entry(order='append')
    page_url = "https://www.supremecourt.gov/oral_arguments/audio/" + TERM + "/" + argument_id
    mp3_url = 'https://www.supremecourt.gov/media/audio/mp3files/' + argument_id + '.mp3'
    entry.id(page_url)
    entry.title(argument_number + ": " + argument_title)
    entry.link(href=page_url)
    entry.enclosure(mp3_url, get_filesize(argument_id), 'audio/mpeg')
    entry.published(argument_date)
    docket_url = "https://www.supremecourt.gov/docket/docketfiles/html/public/" + docket_number + ".html"
    # Description is the extracted "Question Presented" text plus a pointer
    # to the docket page.
    entry.description(parse_qp(argument_number) + "\nThe Supreme Court docket for this case is available at " + docket_url + ".")
def parse_qp(docket_number):
    """Fetch the "Question Presented" PDF for a docket and return its text.

    Downloads the QP PDF from supremecourt.gov, extracts the text with
    pdfminer, drops the lower-court/decision-below boilerplate lines, and
    collapses runs of spaces to single spaces.
    """
    if "-Orig" in docket_number:
        # Original-jurisdiction cases: QP files are named "<num> orig".
        docket = docket_number.split("-")[0] + ' orig'
    else:
        # Normal dockets are zero-padded to five digits, e.g. "17-00021".
        split_docket = docket_number.split("-")
        docket = '{term}-{num:05d}'.format(term=split_docket[0], num=int(split_docket[1]))
    fp = io.BytesIO(requests.get("https://www.supremecourt.gov/qp/" + docket + "qp.pdf").content)
    # pdfminer setup (old pdfminer API: PDFDocument lives in pdfparser and
    # the parser/document objects must be cross-linked by hand, in order).
    parser = PDFParser(fp)  # NOTE: locally shadows the dateutil `parser` import
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')  # empty password
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    extracted_text = ''
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                # (cid:160) is a non-breaking space pdfminer failed to map.
                text = lt_obj.get_text().replace("(cid:160)", " ")
                # Skip boilerplate header lines; keep the question text.
                if ("LOWER COURT CASE NUMBER:" not in text) and ("DECISION BELOW:" not in text):
                    extracted_text += text
    return re.sub(' +', ' ', extracted_text)
2018-05-20 21:45:39 +00:00
def parse_sessions(feed, sessions):
    """Walk each argument-session table and add every argument to the feed.

    The site lists the newest arguments first; rows are processed in
    reverse so entries are appended oldest-first. The header row is skipped.
    """
    for session in sessions:
        rows = session.find_all("tr")
        # Drop the header row, then iterate in reverse order.
        for row in reversed(rows[1:]):
            argument_number = row.a.string
            if "-Orig" in argument_number:
                # magic docket number for now, see
                # https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/
                docket_number = "22o" + argument_number.split("-")[0]
            elif "-Question-" in argument_number:
                # special case for two-part Obergefell v. Hodges argument
                docket_number = "-".join(argument_number.split("-")[0:2])
            else:
                docket_number = argument_number
            argument_id = row.a['href'].split("/")[-1]
            argument_title = row.find_all("span")[1].string
            argument_date = parser.parse(row.find_all("td")[1].string).replace(tzinfo=timezone.utc)
            add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number)
if __name__ == "__main__":
    # Silence pdfminer's logging spam by raising the root logger's level.
    # https://stackoverflow.com/questions/29762706/warnings-on-pdfminer
    # BUG FIX: the original also did `logging.propagate = False`, which only
    # sets an attribute on the logging *module* object — `propagate` is a
    # per-logger attribute, so that line was a no-op and has been removed.
    logging.getLogger().setLevel(logging.ERROR)
    # Command-line interface.
    args = argparse.ArgumentParser(description='Generate an RSS feed for a particular term of the court.')
    args.add_argument('--term', required=True, help="The term to generate the feed for.")
    args.add_argument('--link', required=True, help="The URL of the completed feed.")
    args.add_argument('--home', help="The landing page for the source of the audio. Suggested if using a logo.")
    args.add_argument('--logo', help="The URL of a logo for the feed.")
    arglist = args.parse_args()
    # Module-level configuration read by feedbase() and add_argument().
    TERM = arglist.term
    LINK = arglist.link
    LOGO = arglist.logo
    HOME = arglist.home
    # Scrape the term's argument-audio index page for its session tables.
    content = requests.get("https://www.supremecourt.gov/oral_arguments/argument_audio/" + TERM).content
    sessions = BeautifulSoup(content, "html.parser").find_all("table", class_="table table-bordered")
    feed = feedbase()
    parse_sessions(feed, sessions)
    print(feed.rss_str(pretty=True).decode('utf-8'))