|
Python语言是一种更高级的编程语言，能够用极少的代码量实现其它语言需要数倍代码才能实现的功能，还能灵活引入第三方包来实现专属功能，比如通过xlwings操作Excel，实现类似VBA宏的功能。站长于2020年入门Python语言，2年后的2022年在企业内部组织实施HR人力资源管理系统时，利用Python编写代码读取Excel数据并导入MySQL，完成了HR系统的数据初始化，Python编程能力也因此得到较大提升。最近整理修改了一个爬虫程序，可以爬取贴吧帖子标题，在此分享给大家，抛砖引玉。（附代码文件及输出结果）
- # encoding=utf-8
- import urllib
- import urllib.request
- import re
class teiba():
    """Scrape thread titles from Baidu Tieba list pages into text files.

    For every requested page the list page is downloaded, thread titles are
    extracted with a regular expression, embedded topic-tag ``<span>``
    markup is removed, and each title is appended as one line to
    ``di<page>yehtml.txt`` in the current working directory.
    """

    # Base list-page URL; the forum name (kw) and the thread offset (pn) are
    # appended as URL-encoded query parameters.
    _BASE_URL = "https://tieba.baidu.com/f?ie=utf-8&"

    # The pn offset advances by this many threads per list page.
    _THREADS_PER_PAGE = 50

    # Browser-like User-Agent sent with every request.
    _HEADERS = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }

    # Compiled once at class creation instead of on every dealPage call.
    # Captures the inner HTML of each thread-title anchor; re.S lets the
    # non-greedy groups span line breaks.
    _TITLE_RE = re.compile(
        r'<a\s+rel="noopener"\s+href="/p/\d+"\s+title=".*?"\s+target="_blank" class="j_th_tit\s+">(.*?)</a>',
        re.S)

    # Matches a topic-tag <span> embedded inside a captured title.
    _TOPIC_TAG_RE = re.compile(
        r'<span\s+class="topic-tag"\s+data-name=".*?">#(.*?)#</span>')

    def spider(self, name, startPage, endPage):
        """Download and process list pages startPage..endPage (inclusive).

        Args:
            name: Forum name, sent as the ``kw`` query parameter.
            startPage: First page number to fetch (int).
            endPage: Last page number to fetch (int, inclusive).
        """
        url = self._BASE_URL + urllib.parse.urlencode({"kw": name})
        for page in range(startPage, endPage + 1):
            # pn is a thread offset, not a page index: page 1 -> 0, page 2 -> 50.
            pn = self._THREADS_PER_PAGE * (page - 1)
            urlFull = url + "&" + urllib.parse.urlencode({"pn": pn})
            print(urlFull)
            html = self.loadPage(urlFull)
            self.dealPage(html, page)

    def loadPage(self, url, timeout=30):
        """Fetch ``url`` and return the raw response body as bytes.

        Args:
            url: Absolute URL to request.
            timeout: Seconds to wait on the connection before urlopen raises,
                so a stalled server cannot hang the crawl indefinitely.

        Returns:
            The undecoded response body.

        Raises:
            urllib.error.URLError: If the request fails or times out.
        """
        request = urllib.request.Request(url, headers=self._HEADERS)
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()

    def dealPage(self, html, page):
        """Extract thread titles from a list page and append them to the page file.

        Args:
            html: Page content, either UTF-8 encoded bytes or an already
                decoded str.
            page: Page number, used to pick the output file.
        """
        if isinstance(html, bytes):
            # Replace undecodable bytes instead of aborting the whole page
            # because of a single malformed sequence.
            html = html.decode("utf-8", errors="replace")
        titleList = self._TITLE_RE.findall(html)
        print(titleList)
        for title in titleList:
            # Strip topic-tag markup so only the plain title text is stored.
            self.writePage(self._TOPIC_TAG_RE.sub("", title), page)

    def writePage(self, context, page):
        """Append ``context`` plus a newline to ``di<page>yehtml.txt``.

        Args:
            context: Text to append (one title).
            page: Page number embedded in the output file name.
        """
        fileName = "di" + str(page) + "yehtml.txt"
        # Explicit UTF-8: titles are usually Chinese, and the platform
        # default codec may be unable to encode them.
        with open(fileName, "a", encoding="utf-8") as file:
            file.write(context + "\n")
if __name__ == '__main__':
    # Ask for the forum name and the page range first; the page numbers are
    # converted to int only after all three prompts have been answered.
    prompts = ("请输入贴吧名:", "请输入起始页:", "请输入终止页:")
    forum, first, last = [input(prompt) for prompt in prompts]
    teiba().spider(forum, int(first), int(last))
复制代码 |
|