From f25809148876a53272ba025e502e521c0ed4d706 Mon Sep 17 00:00:00 2001 From: David Ashby Date: Mon, 21 May 2018 22:15:20 -0400 Subject: [PATCH] add question presented PDF parsing --- feed.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/feed.py b/feed.py index d8ca505..89ad533 100755 --- a/feed.py +++ b/feed.py @@ -1,9 +1,16 @@ +import io +import re import requests import argparse +import logging from bs4 import BeautifulSoup from feedgen.feed import FeedGenerator from dateutil import parser from datetime import timezone +from pdfminer.pdfparser import PDFParser, PDFDocument +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.converter import PDFPageAggregator +from pdfminer.layout import LAParams, LTTextBox, LTTextLine def feedbase(): fg = FeedGenerator() @@ -29,14 +36,47 @@ def add_argument(feed, argument_id, argument_number, argument_title, argument_da fe.link(href=url) fe.enclosure('https://www.supremecourt.gov/media/audio/mp3files/' + argument_id + '.mp3', get_filesize(argument_id), 'audio/mpeg') fe.published(argument_date) - fe.description("The Supreme Court docket for this case is available at https://www.supremecourt.gov/docket/docketfiles/html/public/" + docket_number + ".html.") + fe.description(parse_qp(argument_number) + "\nThe Supreme Court docket for this case is available at https://www.supremecourt.gov/docket/docketfiles/html/public/" + docket_number + ".html.") + +def parse_qp(docket_number): + if "-Orig" in docket_number: + docket = docket_number.split("-")[0] + ' orig' + else: + split_docket = docket_number.split("-") + docket = '{term}-{num:05d}'.format(term=split_docket[0], num=int(split_docket[1])) + + fp = io.BytesIO(requests.get("https://www.supremecourt.gov/qp/" + docket + "qp.pdf").content) + parser = PDFParser(fp) + doc = PDFDocument() + parser.set_document(doc) + doc.set_parser(parser) + doc.initialize('') + rsrcmgr = PDFResourceManager() + laparams = LAParams() + laparams.char_margin = 1.0 + laparams.word_margin = 1.0 + device = PDFPageAggregator(rsrcmgr, laparams=laparams) + interpreter = PDFPageInterpreter(rsrcmgr, device) + extracted_text = '' + + for page in doc.get_pages(): + interpreter.process_page(page) + layout = device.get_result() + for lt_obj in layout: + if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine): + text = lt_obj.get_text().replace("(cid:160)", " ") + if ("LOWER COURT CASE NUMBER:" not in text) and ("DECISION BELOW:" not in text): + extracted_text += text + + return re.sub(' +', ' ', extracted_text) def parse_sessions(feed, sessions): for session in sessions: for argument in session.find_all("tr")[:0:-1]: # pop off the header and invert argument_number = argument.a.string if "-Orig" in argument_number: - # magic docket number for now, see https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/ + # magic docket number for now, see + # https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/ docket_number = "22o" + argument_number.split("-")[0] elif "-Question-" in argument_number: # special case for two-part Obergefell v. Hodges argument @@ -50,6 +90,12 @@ def parse_sessions(feed, sessions): add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number) if __name__ == "__main__": + # disable python root logger because of pdfminer spam + # https://stackoverflow.com/questions/29762706/warnings-on-pdfminer + logging.propagate = False + logging.getLogger().setLevel(logging.ERROR) + + # argparse args = argparse.ArgumentParser(description='Generate an RSS feed for a particular term of the court.') args.add_argument('--term', required=True, help="The term to generate the feed for.") args.add_argument('--link', required=True, help="The URL of the completed feed.")