import re
import urllib.request

# Index page that lists every chapter of the novel to download.
_INDEX_URL = 'http://www.quanshuwang.com/book/0/742'

# One chapter link on the index page.  `.*?` is a non-greedy wildcard; the two
# parenthesised groups are the parts we keep: (chapter URL, chapter title).
# Compiled once at import time instead of on every use.
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>')

# Chapter text: everything between the leading marker and the first inline
# <script> tag.  re.S lets `.` match the newlines inside the chapter text.
# NOTE(review): the leading marker here (and in the replace() below) is a
# plain space in this copy of the file; it may originally have been a run of
# `&nbsp;` entities that was lost in transit -- confirm against the live page.
_CHAPTER_BODY_RE = re.compile(r' (.*?)<script type="text/javascript">', re.S)


def _fetch_html(url):
    """Download *url* and return the response body decoded as GBK text."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('gbk')


def getNovelContent():
    """Download every chapter linked from the index page and save it to disk.

    For each ``(url, title)`` pair found on the index page the chapter page is
    fetched, the chapter text is extracted, the marker characters and
    ``<br />`` tags are stripped, and the result is written to ``<title>.txt``
    in the current working directory (UTF-8 encoded).

    A chapter page on which the text pattern is not found is reported and
    skipped, so one bad page does not abort the rest of the download.

    Raises:
        urllib.error.URLError: if the index page or a chapter page cannot be
            fetched.
        UnicodeDecodeError: if a downloaded page is not valid GBK.
        OSError: if an output file cannot be created or written.
    """
    index_html = _fetch_html(_INDEX_URL)

    for novel_url, novel_title in _CHAPTER_LINK_RE.findall(index_html):
        chapt_html = _fetch_html(novel_url)

        # search() + group(1) yields the same text as findall()[0], but lets
        # us detect "no match" without an IndexError.
        match = _CHAPTER_BODY_RE.search(chapt_html)
        if match is None:
            print('no chapter text found at %s, skipping' % novel_url)
            continue

        chapt_content = match.group(1).replace(" ", "").replace("<br />", "")

        print('正在保存%s' % novel_title)
        # Explicit encoding: the text is Chinese, and the platform default
        # encoding may be unable to represent it.  `with` guarantees the file
        # is closed even if the write fails.
        with open('{}.txt'.format(novel_title), 'w', encoding='utf-8') as f:
            f.write(chapt_content)


getNovelContent()