add question presented PDF parsing
This commit is contained in:
parent
e4965998ae
commit
f258091488
50
feed.py
50
feed.py
@ -1,9 +1,16 @@
|
|||||||
|
import io
|
||||||
|
import re
|
||||||
import requests
|
import requests
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from feedgen.feed import FeedGenerator
|
from feedgen.feed import FeedGenerator
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
|
from pdfminer.pdfparser import PDFParser, PDFDocument
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
from pdfminer.converter import PDFPageAggregator
|
||||||
|
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
|
||||||
|
|
||||||
def feedbase():
|
def feedbase():
|
||||||
fg = FeedGenerator()
|
fg = FeedGenerator()
|
||||||
@ -29,14 +36,47 @@ def add_argument(feed, argument_id, argument_number, argument_title, argument_da
|
|||||||
fe.link(href=url)
|
fe.link(href=url)
|
||||||
fe.enclosure('https://www.supremecourt.gov/media/audio/mp3files/' + argument_id + '.mp3', get_filesize(argument_id), 'audio/mpeg')
|
fe.enclosure('https://www.supremecourt.gov/media/audio/mp3files/' + argument_id + '.mp3', get_filesize(argument_id), 'audio/mpeg')
|
||||||
fe.published(argument_date)
|
fe.published(argument_date)
|
||||||
fe.description("The Supreme Court docket for this case is available at https://www.supremecourt.gov/docket/docketfiles/html/public/" + docket_number + ".html.")
|
fe.description(parse_qp(argument_number) + "\nThe Supreme Court docket for this case is available at https://www.supremecourt.gov/docket/docketfiles/html/public/" + docket_number + ".html.")
|
||||||
|
|
||||||
|
def _qp_docket_id(docket_number):
    """Return the docket identifier used in the "question presented" PDF name.

    Numbers containing "-Orig" become "<first part> orig"; every other number
    is formatted as "<term>-<case number zero-padded to five digits>", using
    only the first two dash-separated parts.
    """
    pieces = docket_number.split("-")
    if "-Orig" in docket_number:
        # NOTE(review): this puts a space in the resulting URL path -- confirm
        # that this matches how supremecourt.gov names original-jurisdiction PDFs.
        return pieces[0] + ' orig'
    return '{term}-{num:05d}'.format(term=pieces[0], num=int(pieces[1]))


def parse_qp(docket_number):
    """Download a case's "question presented" PDF and return its text.

    The PDF is fetched from https://www.supremecourt.gov/qp/<id>qp.pdf, where
    <id> is derived from ``docket_number`` by ``_qp_docket_id``. Text boxes
    and text lines are concatenated page by page, skipping any that contain
    "LOWER COURT CASE NUMBER:" or "DECISION BELOW:". Runs of spaces are
    collapsed to a single space before returning.

    Args:
        docket_number: Dash-separated docket string such as "<term>-<number>"
            or one containing "-Orig".

    Returns:
        The extracted text as a single string.

    Raises:
        requests.exceptions.RequestException: If the download times out or
            the server answers with an error status.
        ValueError: If the case-number part of a non-"-Orig" docket is not
            an integer.
    """
    url = "https://www.supremecourt.gov/qp/" + _qp_docket_id(docket_number) + "qp.pdf"
    # A timeout keeps a stalled connection from hanging the whole feed build,
    # and raise_for_status stops an HTML error page from reaching the PDF parser.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    fp = io.BytesIO(response.content)

    # Named pdf_parser so the module-level `dateutil.parser` import is not shadowed.
    pdf_parser = PDFParser(fp)
    doc = PDFDocument()
    pdf_parser.set_document(doc)
    doc.set_parser(pdf_parser)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    chunks = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            # presumably "(cid:160)" is an unmapped non-breaking space glyph -- confirm
            text = lt_obj.get_text().replace("(cid:160)", " ")
            if "LOWER COURT CASE NUMBER:" in text or "DECISION BELOW:" in text:
                continue
            chunks.append(text)

    return re.sub(' +', ' ', ''.join(chunks))
|
||||||
|
|
||||||
def parse_sessions(feed, sessions):
|
def parse_sessions(feed, sessions):
|
||||||
for session in sessions:
|
for session in sessions:
|
||||||
for argument in session.find_all("tr")[:0:-1]: # pop off the header and invert
|
for argument in session.find_all("tr")[:0:-1]: # pop off the header and invert
|
||||||
argument_number = argument.a.string
|
argument_number = argument.a.string
|
||||||
if "-Orig" in argument_number:
|
if "-Orig" in argument_number:
|
||||||
# magic docket number for now, see https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/
|
# magic docket number for now, see
|
||||||
|
# https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/
|
||||||
docket_number = "22o" + argument_number.split("-")[0]
|
docket_number = "22o" + argument_number.split("-")[0]
|
||||||
elif "-Question-" in argument_number:
|
elif "-Question-" in argument_number:
|
||||||
# special case for two-part Obergefell v. Hodges argument
|
# special case for two-part Obergefell v. Hodges argument
|
||||||
@ -50,6 +90,12 @@ def parse_sessions(feed, sessions):
|
|||||||
add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number)
|
add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# disable python root logger because of pdfminer spam
|
||||||
|
# https://stackoverflow.com/questions/29762706/warnings-on-pdfminer
|
||||||
|
logging.propagate = False
|
||||||
|
logging.getLogger().setLevel(logging.ERROR)
|
||||||
|
|
||||||
|
# argparse
|
||||||
args = argparse.ArgumentParser(description='Generate an RSS feed for a particular term of the court.')
|
args = argparse.ArgumentParser(description='Generate an RSS feed for a particular term of the court.')
|
||||||
args.add_argument('--term', required=True, help="The term to generate the feed for.")
|
args.add_argument('--term', required=True, help="The term to generate the feed for.")
|
||||||
args.add_argument('--link', required=True, help="The URL of the completed feed.")
|
args.add_argument('--link', required=True, help="The URL of the completed feed.")
|
||||||
|
Loading…
Reference in New Issue
Block a user