Scan for files and parse html to find images

Scan for Files and Parse HTML to Find Images

scan_ext('c:\\python36\\tools', 'py')
beer.py
eiffel.py
hanoi.py
life.py
markov.py
mcast.py
queens.py
redemo.py
rpython.py
rpythond.py
sortvisu.py
ss1.py
vector.py
makelocalealias.py
msgfmt.py
pygettext.py
unparse.py
ChipViewer.py
ColorDB.py
DetailsViewer.py
ListViewer.py
Main.py
pyColorChooser.py
PyncheWidget.py
StripViewer.py
Switchboard.py
TextViewer.py
TypeinViewer.py
__init__.py
2to3.py
abitype.py
analyze_dxp.py
byext.py
byteyears.py
checkpip.py
checkpyc.py
cleanfuture.py
combinerefs.py
copytime.py
crlf.py
db2pickle.py
diff.py
dutree.py
eptags.py
find-uname.py
finddiv.py
findlinksto.py
findnocoding.py
find_recursionlimit.py
fixcid.py
fixdiv.py
fixheader.py
fixnotice.py
fixps.py
generate_opcode_h.py
get-remote-certificate.py
google.py
gprof2html.py
h2py.py
highlight.py
ifdef.py
import_diagnostics.py
lfcr.py
linktree.py
lll.py
mailerdaemon.py
make_ctype.py
md5sum.py
mkreal.py
ndiff.py
nm2def.py
objgraph.py
parseentities.py
parse_html5_entities.py
patchcheck.py
pathfix.py
pdeps.py
pickle2db.py
pindent.py
ptags.py
pydoc3.py
pysource.py
pyvenv.py
reindent-rst.py
reindent.py
rgrep.py
run_tests.py
serve.py
suff.py
svneol.py
texi2html.py
treesync.py
untabify.py
which.py
win_add2path.py
print(report('c:\\python36\\tools'))
[‘c:\\python36\\tools\\demo\\beer.py’, ‘c:\\python36\\tools\\demo\\eiffel.py’,
‘c:\\python36\\tools\\demo\\hanoi.py’, ‘c:\\python36\\tools\\demo\\life.py’,
‘c:\\python36\\tools\\demo\\markov.py’, ‘c:\\python36\\tools\\demo\\mcast.py’,
‘c:\\python36\\tools\\demo\\queens.py’, ‘c:\\python36\\tools\\demo\\redemo.py’,
‘c:\\python36\\tools\\demo\\rpython.py’, ‘c:\\python36\\tools\\demo\\rpythond.py’,
‘c:\\python36\\tools\\demo\\sortvisu.py’, ‘c:\\python36\\tools\\demo\\ss1.py’,
‘c:\\python36\\tools\\demo\\vector.py’, ‘c:\\python36\\tools\\i18n\\makelocalealias.py’,
‘c:\\python36\\tools\\i18n\\msgfmt.py’, ‘c:\\python36\\tools\\i18n\\pygettext.py’,
‘c:\\python36\\tools\\parser\\unparse.py’, ‘c:\\python36\\tools\\pynche\\ChipViewer.py’,
‘c:\\python36\\tools\\pynche\\ColorDB.py’, ‘c:\\python36\\tools\\pynche\\DetailsViewer.py’,
‘c:\\python36\\tools\\pynche\\ListViewer.py’, ‘c:\\python36\\tools\\pynche\\Main.py’,
‘c:\\python36\\tools\\pynche\\PyncheWidget.py’, ‘c:\\python36\\tools\\pynche\\StripViewer.py’,
‘c:\\python36\\tools\\pynche\\Switchboard.py’, ‘c:\\python36\\tools\\pynche\\TextViewer.py’,
‘c:\\python36\\tools\\pynche\\TypeinViewer.py’, ‘c:\\python36\\tools\\pynche\\__init__.py’,
‘c:\\python36\\tools\\pynche\\html40colors.txt’, ‘c:\\python36\\tools\\pynche\\namedcolors.txt’,
‘c:\\python36\\tools\\pynche\\pyColorChooser.py’, ‘c:\\python36\\tools\\pynche\\pynche.pyw’,
‘c:\\python36\\tools\\pynche\\webcolors.txt’, ‘c:\\python36\\tools\\pynche\\websafe.txt’,
‘c:\\python36\\tools\\scripts\\2to3.py’, ‘c:\\python36\\tools\\scripts\\abitype.py’,
‘c:\\python36\\tools\\scripts\\analyze_dxp.py’, ‘c:\\python36\\tools\\scripts\\byext.py’,
‘c:\\python36\\tools\\scripts\\byteyears.py’, ‘c:\\python36\\tools\\scripts\\checkpip.py’,
‘c:\\python36\\tools\\scripts\\checkpyc.py’, ‘c:\\python36\\tools\\scripts\\cleanfuture.py’,
‘c:\\python36\\tools\\scripts\\combinerefs.py’, ‘c:\\python36\\tools\\scripts\\copytime.py’,
‘c:\\python36\\tools\\scripts\\crlf.py’, ‘c:\\python36\\tools\\scripts\\db2pickle.py’,
‘c:\\python36\\tools\\scripts\\diff.py’, ‘c:\\python36\\tools\\scripts\\dutree.py’,
‘c:\\python36\\tools\\scripts\\eptags.py’, ‘c:\\python36\\tools\\scripts\\find-uname.py’,
‘c:\\python36\\tools\\scripts\\find_recursionlimit.py’, ‘c:\\python36\\tools\\scripts\\finddiv.py’,
‘c:\\python36\\tools\\scripts\\findlinksto.py’, ‘c:\\python36\\tools\\scripts\\findnocoding.py’,
‘c:\\python36\\tools\\scripts\\fixcid.py’, ‘c:\\python36\\tools\\scripts\\fixdiv.py’,
‘c:\\python36\\tools\\scripts\\fixheader.py’, ‘c:\\python36\\tools\\scripts\\fixnotice.py’,
‘c:\\python36\\tools\\scripts\\fixps.py’, ‘c:\\python36\\tools\\scripts\\generate_opcode_h.py’,
‘c:\\python36\\tools\\scripts\\get-remote-certificate.py’, ‘c:\\python36\\tools\\scripts\\google.py’,
‘c:\\python36\\tools\\scripts\\gprof2html.py’, ‘c:\\python36\\tools\\scripts\\h2py.py’,
‘c:\\python36\\tools\\scripts\\highlight.py’, ‘c:\\python36\\tools\\scripts\\ifdef.py’,
‘c:\\python36\\tools\\scripts\\import_diagnostics.py’, ‘c:\\python36\\tools\\scripts\\lfcr.py’,
‘c:\\python36\\tools\\scripts\\linktree.py’, ‘c:\\python36\\tools\\scripts\\lll.py’,
‘c:\\python36\\tools\\scripts\\mailerdaemon.py’, ‘c:\\python36\\tools\\scripts\\make_ctype.py’,
‘c:\\python36\\tools\\scripts\\md5sum.py’, ‘c:\\python36\\tools\\scripts\\mkreal.py’,
‘c:\\python36\\tools\\scripts\\ndiff.py’, ‘c:\\python36\\tools\\scripts\\nm2def.py’,
‘c:\\python36\\tools\\scripts\\objgraph.py’, ‘c:\\python36\\tools\\scripts\\parse_html5_entities.py’,
‘c:\\python36\\tools\\scripts\\parseentities.py’, ‘c:\\python36\\tools\\scripts\\patchcheck.py’,
‘c:\\python36\\tools\\scripts\\pathfix.py’, ‘c:\\python36\\tools\\scripts\\pdeps.py’,
‘c:\\python36\\tools\\scripts\\pickle2db.py’, ‘c:\\python36\\tools\\scripts\\pindent.py’,
‘c:\\python36\\tools\\scripts\\ptags.py’, ‘c:\\python36\\tools\\scripts\\pydoc3.py’,
‘c:\\python36\\tools\\scripts\\pysource.py’, ‘c:\\python36\\tools\\scripts\\pyvenv.py’,
‘c:\\python36\\tools\\scripts\\reindent-rst.py’, ‘c:\\python36\\tools\\scripts\\reindent.py’,
‘c:\\python36\\tools\\scripts\\rgrep.py’, ‘c:\\python36\\tools\\scripts\\run_tests.py’,
‘c:\\python36\\tools\\scripts\\serve.py’, ‘c:\\python36\\tools\\scripts\\suff.py’,
‘c:\\python36\\tools\\scripts\\svneol.py’, ‘c:\\python36\\tools\\scripts\\texi2html.py’,
‘c:\\python36\\tools\\scripts\\treesync.py’, ‘c:\\python36\\tools\\scripts\\untabify.py’,
‘c:\\python36\\tools\\scripts\\which.py’, ‘c:\\python36\\tools\\scripts\\win_add2path.py’,
‘c:\\python36\\tools\\pynche\\X\\rgb.txt’, ‘c:\\python36\\tools\\pynche\\X\\xlicense.txt’]
data("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
Our first web-pageFirstsectionSubsection 1
This is very important.
Second sentence.
LoremIpsum
A Picture
Go to my web-page
Hello World
images("http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html")
pic1a.jpg

Solution

 ########################################################

# Lab assignment 9: Tree recursion and HTML Parsers

#

# This lab includes some examples of recursion

# over folder structures, extending the scanning examples.

# The last two questions ask you to play with the

# HTML parser and extend its functionality a bit.

#

########################################################

#

# Tree recursion and HTML Parsers

#

# You will find recursion over folder structures.

#

########################################################

##########################################################################

#1 scan path and everything below it for all files ending in extension   #

#  ext, e.g. scan_ext('E:\\', 'txt') would search for .txt files on the  #

#  drive E:\ (most likely a USB stick), scan_ext('E:\\', 'py') for .py.  #

#  Print the name of all files satisfying the criterion.                 #

##########################################################################

fromos import listdir

fromos.path import join,isfile,isdir

defscan_ext(path, ext):

‘print all files with extension ext in or below path’

pass

#Hint: base case as before: path is a file

#Hint 2: for checking extension, endswith may come in handy

#test with a USB stick, or a subfolder of C: which is relatively small

###########################################################################

#2 scan path and everything below it and return a list of all files you   #

#  found; the list should contain the full paths for every file you found #

###########################################################################

def report(path):

‘report all files in path or below it’

pass

#Hint: accumulate

#Hint 2: os.path.join (already imported as join) builds each entry's full path

##########################################################################

#3 HTMLParser to extract page content                                    #

#  you want to write a parser and a function that allows you to extract  #

#  all the text (data) on a web-page at a given url, and simply prints it#

##########################################################################

fromhtml.parser import HTMLParser

fromurllib.request import urlopen

classDataParser(HTMLParser):

‘content parser’

defhandle_starttag(self, tag, attrs):

pass

defhandle_endtag(self, tag):

pass

defhandle_data(self, data):

pass

def data(url):

‘print all the content of the page at url’

pass

#Hint: you need to provide one of the three methods only. Which?

#Hint 2: use print(..., end='') to make the output more compact

#

#you can test with http://ovid.cs.depaul.edu/Classes/CSC243-F18/firsthtml.html

#http://www.yahoo.com or other web-pages

##########################################################################

#4 HTMLParser for images                                                 #

#  every image tag has a src attribute:                                  #

#  <img src = 'pic1a.jpg' height = '100'>                                #

#  write a function and a parser that print                              #

#  all the sources of images on the web-page at url; for the above       #

#  image tag, it should print pic1a.jpg                                  #

##########################################################################

fromhtml.parser import HTMLParser

fromurllib.request import urlopen

classImageParser(HTMLParser):

‘image parser’

defhandle_starttag(self, tag, attrs):

pass

defhandle_endtag(self, tag):

pass

defhandle_data(self, data):

pass

def images(url):

‘print all sources of images’

pass

#Hint: src is an attribute, attributes get reported in attrs; in a first step

#      print attrs for all image tags; then add code to drill down

#      to the src attribute

#