# -*- coding: utf-8 -*-
"""Recursively mirror the files exposed by an HTTP directory listing.

The core idea is to use a regular expression to pull the path (directory)
names and file names out of a page's HTML, apply the same extraction to every
directory that is found (a recursive search), and finally download every
content file on the site.

Written for Python 2 originally; the imports below also allow Python 3.
"""
import os
import re
import sys
import urllib

try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urlparse import urljoin
    from urllib import urlopen, urlretrieve

try:
    # Imported by the original script but not used by the code below; kept
    # optional so the crawler still runs where it is not installed.
    import BeautifulSoup
except ImportError:
    BeautifulSoup = None

# Non-greedy on purpose: the capture stops at the closing quote, so several
# anchors on one line yield several links instead of one over-long match.
_HREF_RE = re.compile(r'<a\s+href="([^"]*)"', re.IGNORECASE)

# URLs of every file discovered by extract(), in depth-first page order.
path = []


def _read_page(page_url):
    """Fetch *page_url* and return its body as text."""
    response = urlopen(page_url)
    try:
        body = response.read()
    finally:
        response.close()
    if not isinstance(body, str):
        # Python 3 returns bytes; the regex above is a text pattern.
        body = body.decode('utf-8', 'replace')
    return body


def _crawl(directory_url, fetch, seen):
    """Collect files below *directory_url* into ``path``, depth first."""
    for link in _HREF_RE.findall(fetch(directory_url)):
        if '?' in link or '#' in link:
            # Column-sort links and in-page anchors are not files.
            continue
        target = urljoin(directory_url, link)
        # Only descend: parent-directory, absolute and off-site links would
        # otherwise make the recursion loop forever or leave the tree.
        if target == directory_url or not target.startswith(directory_url):
            continue
        if target in seen:
            continue
        seen.add(target)
        if target.endswith('/'):
            _crawl(target, fetch, seen)
        else:
            path.append(target)


def extract(url, fetch=None):
    """Recursively collect the file URLs below the directory listing *url*.

    Every link ending in ``/`` is treated as a sub-directory and crawled in
    turn; every other link is appended to the module-level ``path`` list.

    Args:
        url: URL of a directory listing. A trailing ``/`` is added when
            missing so that relative links resolve inside the directory.
        fetch: Optional callable taking a URL and returning the page text.
            Defaults to downloading the page with ``urlopen``.
    """
    if fetch is None:
        fetch = _read_page
    if not url.endswith('/'):
        url += '/'
    _crawl(url, fetch, set(path))


def local_path(file_url, base_url, dest_root):
    """Return where *file_url* is stored under *dest_root*.

    The part of *file_url* that follows *base_url* is kept as the relative
    path, so the remote directory layout is reproduced locally. Forward
    slashes are used throughout, as in the original script.
    """
    if not base_url.endswith('/'):
        base_url += '/'
    if file_url.startswith(base_url):
        relative = file_url[len(base_url):]
    else:
        # Not below the base URL: fall back to the bare file name.
        relative = file_url.rsplit('/', 1)[-1]
    return dest_root.rstrip('/') + '/' + relative


url = 'http://139.196.233.65/js/'
filePath = 'E:/6-學習文檔/91-JS/Download/js'
if isinstance(filePath, bytes):
    # Python 2 byte string: decode so the non-ASCII directory name is
    # handled correctly by the file-system calls.
    filePath = filePath.decode('utf8')


def main():
    """Crawl ``url`` and download every discovered file below ``filePath``."""
    print("downloading with urllib")
    extract(url)
    for file_url in path:
        destination = local_path(file_url, url, filePath)
        directory = os.path.dirname(destination)
        if not os.path.isdir(directory):
            os.makedirs(directory)
        urlretrieve(file_url, destination)


if __name__ == '__main__':
    main()

# Source: LibraryPKU > "Python"