一、基本认识
import requests
if __name__ == "__main__":
    # Send a GET request. The timeout (in seconds) keeps the script from
    # waiting forever when the server never answers.
    response = requests.get('http://httpbin.org/get', timeout=10)
import requests
if __name__ == "__main__":
    # Send a GET request. The timeout (in seconds) keeps the script from
    # waiting forever when the server never answers.
    response = requests.get('http://httpbin.org/get', timeout=10)
    # Encoding used to decode the body when reading response.text.
    response.encoding = 'utf-8'
    # Body decoded to text.
    print(response.text)
    # Body parsed as JSON.
    print(response.json())
    # Response headers, status code, final URL and cookies.
    print(response.headers)
    print(response.status_code)
    print(response.url)
    print(response.cookies)
    # Raw, undecoded bytes: use this when response.text comes out garbled
    # or when downloading a binary file.
    print(response.content)
# The other HTTP verbs have the same calling convention as requests.get().
# Each call carries a timeout so an unresponsive server cannot block forever.
response = requests.post('http://httpbin.org/post', timeout=10)
response = requests.put('http://httpbin.org/put', timeout=10)
response = requests.delete('http://httpbin.org/delete', timeout=10)
response = requests.head('http://httpbin.org/get', timeout=10)
response = requests.options('http://httpbin.org/get', timeout=10)
二、关于get请求传递参数
import requests
if __name__ == "__main__":
    # Request headers that imitate a desktop browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    # Send a GET request with the query parameters written directly into
    # the URL. The timeout prevents waiting forever on a silent server.
    response = requests.get(
        'http://httpbin.org/get?name=june&password=123456',
        headers=headers,
        timeout=10,
    )
    # Encoding used to decode the body when reading response.text.
    response.encoding = 'utf-8'
    print(response.text)
import requests
if __name__ == "__main__":
    # Request headers that imitate a desktop browser.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    # Query parameters; requests appends them to the URL via params=.
    data = {'name': 'june', 'password': 123456}
    # Send the GET request. The timeout prevents waiting forever on a
    # silent server.
    response = requests.get(
        'http://httpbin.org/get',
        headers=headers,
        params=data,
        timeout=10,
    )
    # Encoding used to decode the body when reading response.text.
    response.encoding = 'utf-8'
    print(response.text)
三、使用requests库和正则表达式下载文章内容
- 1、需要下载的伯乐在线的文章标题
- 2、书写逻辑代码
import re
import requests
if __name__ == "__main__":
    # Request headers that imitate a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
    }
    url = 'http://python.jobbole.com/category/guide/'
    # The timeout prevents waiting forever on a silent server.
    response = requests.get(url=url, headers=headers, timeout=10)
    print(response.status_code)

    # Capture the value of the title="..." attribute that follows each
    # "post-thumb" div. ".*?" matches lazily and re.S lets "." match
    # newlines, so a match can span several lines of markup.
    pattern = re.compile(
        r'<div.*?post-thumb.*?title="(.*?)".*?</a>', re.S
    )
    result_list = pattern.findall(response.text)

    # 'a+' appends to any existing file. The with-block guarantees the file
    # is closed even if a write fails part-way through.
    with open('jobbole1.txt', 'a+', encoding='utf8') as f:
        f.writelines(item.strip() + '\n' for item in result_list)
- 3、解说正则表达式
- .*?表示非贪婪的匹配任何字符
- re.S 使.匹配包括换行在内的全部字符
四、使用requests库和正则表达式下载图片
import re
import os
import shutil
import requests
class DownPic(object):
    """Download the thumbnail images referenced on a category page.

    Creating an instance deletes and recreates the local ``demo`` directory.
    ``download()`` then fetches the page, extracts the image URLs with a
    regular expression and saves every ``.png``/``.jpg`` file into that
    directory.
    """

    # Captures the value of the src="..." attribute that follows each
    # "post-thumb" div. ".*?" is lazy and re.S lets "." match newlines.
    IMAGE_PATTERN = re.compile(
        r'<div.*?post-thumb.*?src="(.*?)".*?</a>', re.S
    )
    # File extensions (lower case, without the dot) that are downloaded.
    IMAGE_EXTENSIONS = ('png', 'jpg')
    # Directory the images are written to; wiped on every instantiation.
    SAVE_DIR = 'demo'
    # Seconds to wait for a server before giving up on a request.
    TIMEOUT = 10

    def __init__(self):
        self.url = 'http://python.jobbole.com/category/guide/'
        # Request headers that imitate a desktop browser.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        }
        self.create_dir()

    def create_dir(self):
        """Recreate the download directory, discarding previous contents."""
        if os.path.exists(self.SAVE_DIR):
            shutil.rmtree(self.SAVE_DIR)
        os.makedirs(self.SAVE_DIR)

    def get_html(self):
        """Fetch ``self.url`` and return the response body as text."""
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self.TIMEOUT
        )
        return response.text

    @classmethod
    def _extract_image_urls(cls, html):
        """Return every image URL that IMAGE_PATTERN finds in *html*."""
        return cls.IMAGE_PATTERN.findall(html)

    @classmethod
    def _is_supported_image(cls, url):
        """Return True if *url* ends in one of IMAGE_EXTENSIONS (any case)."""
        return url.rsplit('.', 1)[-1].lower() in cls.IMAGE_EXTENSIONS

    def pattern(self):
        """Return the list of image URLs found on the fetched page."""
        return self._extract_image_urls(self.get_html())

    def download(self):
        """Download every supported image into the download directory.

        A failure on one image is printed and skipped so the remaining
        images are still downloaded.
        """
        for item in self.pattern():
            # Strip first so stray whitespace cannot defeat the extension
            # check or end up in the file name.
            image_url = item.strip()
            if not self._is_supported_image(image_url):
                continue
            file_path = os.path.join(
                self.SAVE_DIR, image_url.rsplit('/', 1)[-1]
            )
            try:
                # Request the image itself; the request is inside the try so
                # a network error affects only this image.
                resp = requests.get(image_url, timeout=self.TIMEOUT)
                # Do not save an HTTP error page as if it were an image.
                resp.raise_for_status()
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
            except (requests.RequestException, OSError) as e:
                print(e)
if __name__ == "__main__":
    # Build the downloader and immediately start downloading the images.
    DownPic().download()
五、关于requests库的post请求
# General form of a POST request: `data` carries the form fields and
# `headers` the custom request headers (both are dicts defined by the
# surrounding examples). The timeout prevents waiting forever on a silent
# server.
response = requests.post(
    'http://httpbin.org/post',
    headers=headers,
    data=data,
    timeout=10,
)
import requests
if __name__ == "__main__":
    # Request headers that imitate a desktop browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
    # Form fields sent in the request body.
    data = {'email': '22@qq.com', 'password': 123456}
    # Send a POST request. The timeout prevents waiting forever on a silent
    # server.
    response = requests.post(
        'https://httpbin.org/post',
        headers=headers,
        data=data,
        timeout=10,
    )
    # Encoding used to decode the body when reading response.text.
    response.encoding = 'utf-8'
    print(response.text)
六、使用post请求获取拉勾网职业信息
import requests
if __name__ == "__main__":
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # Browser-like User-Agent plus the Referer of the job-list page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    }
    # Form fields sent in the POST body.
    # NOTE(review): presumably 'pn' is the page number and 'kd' the search
    # keyword; confirm against the site's API.
    data = {
        'first': 'true',
        'pn': '1',
        'kd': 'python',
    }
    # The timeout prevents waiting forever on a silent server.
    response = requests.post(url=url, headers=headers, data=data, timeout=10)
    try:
        print(response.json())
    except ValueError:
        # The body was not valid JSON; show what actually came back instead
        # of ending with a traceback.
        print(response.status_code, response.text)