Scan for files and parse html to find images

Scan for Files and Parse HTML to Find Images

scan_ext('c:\\python36\\tools', 'py')
beer.py
eiffel.py
hanoi.py
life.py
markov.py
mcast.py
queens.py
redemo.py
rpython.py
rpythond.py
sortvisu.py
ss1.py
vector.py
makelocalealias.py
msgfmt.py
pygettext.py
unparse.py
ChipViewer.py
ColorDB.py
DetailsViewer.py
ListViewer.py
Main.py
pyColorChooser.py
PyncheWidget.py
StripViewer.py
Switchboard.py
TextViewer.py
TypeinViewer.py
__init__.py
2to3.py
abitype.py
analyze_dxp.py
byext.py
byteyears.py
checkpip.py
checkpyc.py
cleanfuture.py
combinerefs.py
copytime.py
crlf.py
db2pickle.py
diff.py
dutree.py
eptags.py
find-uname.py
finddiv.py
findlinksto.py
findnocoding.py
find_recursionlimit.py
fixcid.py
fixdiv.py
fixheader.py
fixnotice.py
fixps.py
generate_opcode_h.py
get-remote-certificate.py
google.py
gprof2html.py
h2py.py
highlight.py
ifdef.py
import_diagnostics.py
lfcr.py
linktree.py
lll.py
mailerdaemon.py
make_ctype.py
md5sum.py
mkreal.py
ndiff.py
nm2def.py
objgraph.py
parseentities.py
parse_html5_entities.py
patchcheck.py
pathfix.py
pdeps.py
pickle2db.py
pindent.py
ptags.py
pydoc3.py
pysource.py
pyvenv.py
reindent-rst.py
reindent.py
rgrep.py
run_tests.py
serve.py
suff.py
svneol.py
texi2html.py
treesync.py
untabify.py
which.py
win_add2path.py
print(report('c:\\python36\\tools'))
[‘c:\\python36\\tools\\demo\\beer.py’, ‘c:\\python36\\tools\\demo\\eiffel.py’,
‘c:\\python36\\tools\\demo\\hanoi.py’, ‘c:\\python36\\tools\\demo\\life.py’,
‘c:\\python36\\tools\\demo\\markov.py’, ‘c:\\python36\\tools\\demo\\mcast.py’,
‘c:\\python36\\tools\\demo\\queens.py’, ‘c:\\python36\\tools\\demo\\redemo.py’,
‘c:\\python36\\tools\\demo\\rpython.py’, ‘c:\\python36\\tools\\demo\\rpythond.py’,
‘c:\\python36\\tools\\demo\\sortvisu.py’, ‘c:\\python36\\tools\\demo\\ss1.py’,
‘c:\\python36\\tools\\demo\\vector.py’, ‘c:\\python36\\tools\\i18n\\makelocalealias.py’,
‘c:\\python36\\tools\\i18n\\msgfmt.py’, ‘c:\\python36\\tools\\i18n\\pygettext.py’,
‘c:\\python36\\tools\\parser\\unparse.py’, ‘c:\\python36\\tools\\pynche\\ChipViewer.py’,
‘c:\\python36\\tools\\pynche\\ColorDB.py’, ‘c:\\python36\\tools\\pynche\\DetailsViewer.py’,
‘c:\\python36\\tools\\pynche\\ListViewer.py’, ‘c:\\python36\\tools\\pynche\\Main.py’,
‘c:\\python36\\tools\\pynche\\PyncheWidget.py’, ‘c:\\python36\\tools\\pynche\\StripViewer.py’,
‘c:\\python36\\tools\\pynche\\Switchboard.py’, ‘c:\\python36\\tools\\pynche\\TextViewer.py’,
‘c:\\python36\\tools\\pynche\\TypeinViewer.py’, ‘c:\\python36\\tools\\pynche\\__init__.py’,
‘c:\\python36\\tools\\pynche\\html40colors.txt’, ‘c:\\python36\\tools\\pynche\\namedcolors.txt’,
‘c:\\python36\\tools\\pynche\\pyColorChooser.py’, ‘c:\\python36\\tools\\pynche\\pynche.pyw’,
‘c:\\python36\\tools\\pynche\\webcolors.txt’, ‘c:\\python36\\tools\\pynche\\websafe.txt’,
‘c:\\python36\\tools\\scripts\\2to3.py’, ‘c:\\python36\\tools\\scripts\\abitype.py’,
‘c:\\python36\\tools\\scripts\\analyze_dxp.py’, ‘c:\\python36\\tools\\scripts\\byext.py’,
‘c:\\python36\\tools\\scripts\\byteyears.py’, ‘c:\\python36\\tools\\scripts\\checkpip.py’,
‘c:\\python36\\tools\\scripts\\checkpyc.py’, ‘c:\\python36\\tools\\scripts\\cleanfuture.py’,
‘c:\\python36\\tools\\scripts\\combinerefs.py’, ‘c:\\python36\\tools\\scripts\\copytime.py’,
‘c:\\python36\\tools\\scripts\\crlf.py’, ‘c:\\python36\\tools\\scripts\\db2pickle.py’,
‘c:\\python36\\tools\\scripts\\diff.py’, ‘c:\\python36\\tools\\scripts\\dutree.py’,
‘c:\\python36\\tools\\scripts\\eptags.py’, ‘c:\\python36\\tools\\scripts\\find-uname.py’,
‘c:\\python36\\tools\\scripts\\find_recursionlimit.py’, ‘c:\\python36\\tools\\scripts\\finddiv.py’,
‘c:\\python36\\tools\\scripts\\findlinksto.py’, ‘c:\\python36\\tools\\scripts\\findnocoding.py’,
‘c:\\python36\\tools\\scripts\\fixcid.py’, ‘c:\\python36\\tools\\scripts\\fixdiv.py’,
‘c:\\python36\\tools\\scripts\\fixheader.py’, ‘c:\\python36\\tools\\scripts\\fixnotice.py’,
‘c:\\python36\\tools\\scripts\\fixps.py’, ‘c:\\python36\\tools\\scripts\\generate_opcode_h.py’,
‘c:\\python36\\tools\\scripts\\get-remote-certificate.py’, ‘c:\\python36\\tools\\scripts\\google.py’,
‘c:\\python36\\tools\\scripts\\gprof2html.py’, ‘c:\\python36\\tools\\scripts\\h2py.py’,
‘c:\\python36\\tools\\scripts\\highlight.py’, ‘c:\\python36\\tools\\scripts\\ifdef.py’,
‘c:\\python36\\tools\\scripts\\import_diagnostics.py’, ‘c:\\python36\\tools\\scripts\\lfcr.py’,
‘c:\\python36\\tools\\scripts\\linktree.py’, ‘c:\\python36\\tools\\scripts\\lll.py’,
‘c:\\python36\\tools\\scripts\\mailerdaemon.py’, ‘c:\\python36\\tools\\scripts\\make_ctype.py’,
‘c:\\python36\\tools\\scripts\\md5sum.py’, ‘c:\\python36\\tools\\scripts\\mkreal.py’,
‘c:\\python36\\tools\\scripts\\ndiff.py’, ‘c:\\python36\\tools\\scripts\\nm2def.py’,
‘c:\\python36\\tools\\scripts\\objgraph.py’, ‘c:\\python36\\tools\\scripts\\parse_html5_entities.py’,
‘c:\\python36\\tools\\scripts\\parseentities.py’, ‘c:\\python36\\tools\\scripts\\patchcheck.py’,
‘c:\\python36\\tools\\scripts\\pathfix.py’, ‘c:\\python36\\tools\\scripts\\pdeps.py’,
‘c:\\python36\\tools\\scripts\\pickle2db.py’, ‘c:\\python36\\tools\\scripts\\pindent.py’,
‘c:\\python36\\tools\\scripts\\ptags.py’, ‘c:\\python36\\tools\\scripts\\pydoc3.py’,
‘c:\\python36\\tools\\scripts\\pysource.py’, ‘c:\\python36\\tools\\scripts\\pyvenv.py’,
‘c:\\python36\\tools\\scripts\\reindent-rst.py’, ‘c:\\python36\\tools\\scripts\\reindent.py’,
‘c:\\python36\\tools\\scripts\\rgrep.py’, ‘c:\\python36\\tools\\scripts\\run_tests.py’,
‘c:\\python36\\tools\\scripts\\serve.py’, ‘c:\\python36\\tools\\scripts\\suff.py’,
‘c:\\python36\\tools\\scripts\\svneol.py’, ‘c:\\python36\\tools\\scripts\\texi2html.py’,
‘c:\\python36\\tools\\scripts\\treesync.py’, ‘c:\\python36\\tools\\scripts\\untabify.py’,
‘c:\\python36\\tools\\scripts\\which.py’, ‘c:\\python36\\tools\\scripts\\win_add2path.py’,
‘c:\\python36\\tools\\pynche\\X\\rgb.txt’, ‘c:\\python36\\tools\\pynche\\X\\xlicense.txt’]
data("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
Our first web-pageFirstsectionSubsection 1
This is very important.
Second sentence.
LoremIpsum
A Picture
Go to my web-page
Hello World
images("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
pic1a.jpg

Solution

 ########################################################

# Lab assignment 9: Tree recursion and HTML Parsers

#

# This lab includes some examples of recursion

# over folder structures, extending the scanning examples.

# The last two questions ask you to play with the

# HTML parser and extend its functionality a bit.

#

########################################################

#

# Tree recursion and HTML Parsers

#

# You will find recursion over folder structures.

#

########################################################

##########################################################################

#1 scan path and everything below it for all files ending in extension   #

#  ext, e.g. scan_ext('E:\\', 'txt') would search for .txt files on the  #

#  drive E:\ (most likely a USB stick), scan_ext('E:\\', 'py') for .py.  #

#  Print the name of all files satisfying the criterion.                 #

##########################################################################

fromos import listdir

fromos.path import join,isfile,isdir

defscan_ext(path, ext):

‘print all files with extension ext in or below path’

pass

#Hint: base case as before: path is a file

#Hint 2: for checking extension, endswith may come in handy

#test with a USB stick, or a subfolder of C: which is relatively small

###########################################################################

#2 scan path and everything below it and return a list of all files you   #

#  found; the list should contain the full paths for every file you found #

###########################################################################

def report(path):

‘report all files in path or below it’

pass

#Hint: accumulate

#Hint 2: os.path.join (already imported as join) builds each entry's full path

##########################################################################

#3 HTMLParser to extract page content                                    #

#  you want to write a parser and a function that allows you to extract  #

#  all the text (data) on a web-page at a given url, and simply prints it#

##########################################################################

fromhtml.parser import HTMLParser

fromurllib.request import urlopen

classDataParser(HTMLParser):

‘content parser’

defhandle_starttag(self, tag, attrs):

pass

defhandle_endtag(self, tag):

pass

defhandle_data(self, data):

pass

def data(url):

‘print all the content of the page at url’

pass

#Hint: you need to provide one of the three methods only. Which?

#Hint 2: use print(..., end='') to make the output more compact

#

#you can test with http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html

#http://www.yahoo.com or other web-pages

##########################################################################

#4 HTMLParser for images                                                 #

#  every image tag has a src attribute:                                  #

#  <img src = 'pic1a.jpg' height = '100'>                                #

#  write a function and a parser that print                              #

#  all the sources of images on the web-page at url; for the above       #

#  image tag, it should print pic1a.jpg                                  #

##########################################################################

fromhtml.parser import HTMLParser

fromurllib.request import urlopen

classImageParser(HTMLParser):

‘image parser’

defhandle_starttag(self, tag, attrs):

pass

defhandle_endtag(self, tag):

pass

defhandle_data(self, data):

pass

def images(url):

‘print all sources of images’

pass

#Hint: src is an attribute, attributes get reported in attrs; in a first step

#      print attrs for all image tags; then add code to drill down

#      to the src attribute

#