2018-05-22 02:15:20 +00:00
import io
import re
2018-05-20 21:45:39 +00:00
import requests
import argparse
2018-05-22 02:15:20 +00:00
import logging
2018-05-20 21:45:39 +00:00
from bs4 import BeautifulSoup
from feedgen . feed import FeedGenerator
from dateutil import parser
from datetime import timezone
2018-05-22 02:15:20 +00:00
from pdfminer . pdfparser import PDFParser , PDFDocument
from pdfminer . pdfinterp import PDFResourceManager , PDFPageInterpreter
from pdfminer . converter import PDFPageAggregator
from pdfminer . layout import LAParams , LTTextBox , LTTextLine
2018-05-20 21:45:39 +00:00
def feedbase():
    """Build the feed-level skeleton of the podcast feed.

    Uses the module-level TERM, LINK, HOME and LOGO values. HOME and LOGO
    are optional: the alternate link and the logo are only set when the
    corresponding value is truthy.

    Returns:
        The configured FeedGenerator with the podcast extension loaded.
        No entries are added here.
    """
    generator = FeedGenerator()
    generator.load_extension(' podcast ')

    # Feed-wide metadata.
    feed_title = ' SCOTUS Audio ' + TERM + ' Term '
    generator.title(feed_title)
    generator.subtitle(' This is an automated feed of the mp3 files from the SCOTUS website. NOT AFFILIATED WITH THE COURT OR THE JUSTICES. ')
    generator.link(href=LINK, rel=' self ')
    generator.language(' en ')

    # Optional branding supplied on the command line.
    if HOME:
        generator.link(href=HOME, rel=' alternate ')
    if LOGO:
        generator.logo(LOGO)

    return generator
def get_filesize(argument_id):
    """Look up the size of an argument's mp3 file.

    Sends a HEAD request for the mp3 built from ``argument_id`` and returns
    the value stored under the content-length key of the response headers.
    The value is returned as found in the headers; it is not converted
    here.

    Args:
        argument_id: Identifier inserted into the mp3 file URL.
    """
    mp3_url = ' https://www.supremecourt.gov/media/audio/mp3files/ ' + argument_id + ' .mp3 '
    response = requests.head(mp3_url)
    return response.headers[' content-length ']
def add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number):
    """Append one oral argument to the feed as an entry.

    Args:
        feed: Feed object that receives the new entry.
        argument_id: Identifier used in the argument page URL and mp3 URL.
        argument_number: Shown in the entry title and handed to parse_qp()
            to obtain the description text.
        argument_title: Shown in the entry title after the number.
        argument_date: Value passed to the entry's published date.
        docket_number: Inserted into the docket link in the description.
    """
    entry = feed.add_entry(order=' append ')

    # The argument's page doubles as the entry id and its link.
    page_url = " https://www.supremecourt.gov/oral_arguments/audio/ " + TERM + " / " + argument_id
    entry.id(page_url)
    entry.title(argument_number + " : " + argument_title)
    entry.link(href=page_url)

    # Audio enclosure; the size comes from a HEAD request on the mp3.
    mp3_url = ' https://www.supremecourt.gov/media/audio/mp3files/ ' + argument_id + ' .mp3 '
    mp3_size = get_filesize(argument_id)
    entry.enclosure(mp3_url, mp3_size, ' audio/mpeg ')

    entry.published(argument_date)

    # Description: text extracted by parse_qp() followed by a docket pointer.
    questions_text = parse_qp(argument_number)
    entry.description(questions_text + " \n The Supreme Court docket for this case is available at https://www.supremecourt.gov/docket/docketfiles/html/public/ " + docket_number + " .html. ")
def parse_qp(docket_number):
    """Download a case's "qp" PDF from supremecourt.gov and return its text.

    The PDF file name is derived from ``docket_number``: the number is split
    on the separator literal, and for non-original cases the second piece is
    converted to an int and zero-padded to five digits.

    Text is collected from every LTTextBox / LTTextLine layout object on
    every page. Objects whose text contains the lower-court case number or
    "decision below" markers are skipped. Runs of spaces matched by the
    final regex are collapsed to a single space.

    Args:
        docket_number: Case number string used to build the PDF name.

    Returns:
        The extracted text as a single string.
    """
    # NOTE(review): string literals are reproduced exactly as they appear in
    # the source, including the spaces just inside the quotes - confirm
    # against upstream.
    if " -Orig " in docket_number:
        docket = docket_number.split(" - ")[0] + ' orig '
    else:
        split_docket = docket_number.split(" - ")
        docket = ' {term} - {num:05d} '.format(term=split_docket[0], num=int(split_docket[1]))

    response = requests.get(" https://www.supremecourt.gov/qp/ " + docket + " qp.pdf ")
    pdf_stream = io.BytesIO(response.content)

    # Named pdf_parser rather than parser so it does not shadow the dateutil
    # `parser` module imported at file level.
    pdf_parser = PDFParser(pdf_stream)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize(' ')

    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    layout_params.char_margin = 1.0
    layout_params.word_margin = 1.0
    device = PDFPageAggregator(resource_manager, laparams=layout_params)
    interpreter = PDFPageInterpreter(resource_manager, device)

    # Collect pieces in a list and join once instead of growing a string
    # with += inside the nested loops. The first element is the same seed
    # value the accumulator previously started from.
    pieces = [' ']
    for page in document.get_pages():
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            text = lt_obj.get_text().replace(" (cid:160) ", " ")
            # Drop the boilerplate header lines of the document.
            if " LOWER COURT CASE NUMBER: " in text or " DECISION BELOW: " in text:
                continue
            pieces.append(text)

    return re.sub(' + ', ' ', ''.join(pieces))
2018-05-20 21:45:39 +00:00
def _docket_number_for(argument_number):
    """Return the docket number to use for the given argument number."""
    if " -Orig " in argument_number:
        # magic docket number for now, see
        # https://www.cocklelegalbriefs.com/blog/supreme-court/the-u-s-supreme-courts-use-of-docket-numbers/
        return " 22o " + argument_number.split(" - ")[0]
    if " -Question- " in argument_number:
        # special case for two-part Obergefell v. Hodges argument
        return " - ".join(argument_number.split(" - ")[0:2])
    return argument_number


def parse_sessions(feed, sessions):
    """Add every argument found in the session tables to the feed.

    Each session's rows are visited last-to-first with the header row
    (index 0) left out. For each row the argument number, id, title and date
    are read from the markup and passed to add_argument() together with the
    derived docket number.

    Args:
        feed: Feed object passed through to add_argument().
        sessions: Iterable of parsed table elements, one per session.
    """
    for session in sessions:
        rows = session.find_all(" tr ")
        # pop off the header and invert
        for row in rows[:0:-1]:
            anchor = row.a
            argument_number = anchor.string
            docket_number = _docket_number_for(argument_number)

            argument_id = anchor[' href '].split(" / ")[-1]
            argument_title = row.find_all(" span ")[1].string

            # The date comes from the second cell and is stamped as UTC.
            date_text = row.find_all(" td ")[1].string
            argument_date = parser.parse(date_text).replace(tzinfo=timezone.utc)

            add_argument(feed, argument_id, argument_number, argument_title, argument_date, docket_number)
# NOTE(review): every string literal in this block is kept exactly as it
# appears in the source, including the spaces just inside the quotes (the
# guard string and the option names included) - confirm against upstream.
if __name__ == " __main__ ":
    # Quiet pdfminer's log spam by raising the root logger's threshold, see
    # https://stackoverflow.com/questions/29762706/warnings-on-pdfminer
    # `propagate` is an attribute of Logger objects; assigning it on the
    # `logging` module itself is never read by the logging machinery, so
    # only the level is set here.
    logging.getLogger().setLevel(logging.ERROR)

    # argparse
    args = argparse.ArgumentParser(description=' Generate an RSS feed for a particular term of the court. ')
    args.add_argument(' --term ', required=True, help=" The term to generate the feed for. ")
    args.add_argument(' --link ', required=True, help=" The URL of the completed feed. ")
    args.add_argument(' --home ', help=" The landing page for the source of the audio. Suggested if using a logo. ")
    args.add_argument(' --logo ', help=" The URL of a logo for the feed. ")
    arglist = args.parse_args()

    # Module-level settings read by feedbase() and add_argument().
    TERM = arglist.term
    LINK = arglist.link
    LOGO = arglist.logo
    HOME = arglist.home

    # Fetch the term's audio index page and pick out the session tables.
    content = requests.get(" https://www.supremecourt.gov/oral_arguments/argument_audio/ " + TERM).content
    sessions = BeautifulSoup(content, " html.parser ").find_all(" table ", class_=" table table-bordered ")

    # Build the feed and write it to stdout as RSS.
    feed = feedbase()
    parse_sessions(feed, sessions)
    print(feed.rss_str(pretty=True).decode(' utf-8 '))