利用python爬取cn-ki.net检索结果并写入mysql

准备

所需环境

更改字符编码格式

当出现ERROR 1366 (HY000):Incorrect string value时,需将字符编码格式统一设置为gbk格式。

MYSQL>set character_set_client = 'gbk' ;
MYSQL>set character_set_connection = 'gbk' ;
MYSQL>set character_set_results= 'gbk' ;
MYSQL>set character_set_server= 'gbk' ;
MYSQL>set character_set_database= 'gbk' ;

创建表格

CREATE DATABASE test01;
USE test01;
CREATE TABLE `lunwen` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(100) DEFAULT NULL,
  `Abstract` varchar(1000) DEFAULT NULL,
   PRIMARY KEY (`id`)
) ;

代码

import requests
import mysql.connector
from lxml import html


KeyWords = '人工智能'       # 搜索关键词
MaxPage = 1                 # 爬取的页面数目
URL = 'https://search.ehn3.com/'

Num_Paper = 0

data = {
    'keyword': KeyWords,
    'db': 'SCDB'
}


def get_html(url, para_data):
    content = requests.get(url, params=para_data)
    return content

# 链接mysql数据库
conn = mysql.connector.connect(user='root', password='LLte6628', database='test01')
cur = conn.cursor()

content = get_html(URL+'search', data)
page_url = content.url

page_ii = 0
while page_ii < MaxPage:
    content = get_html(page_url, '')
    tree = html.fromstring(content.text)

    e1 = tree.xpath('//div[@class="mdui-col-xs-12 mdui-col-md-9 mdui-typo"]')
    for ei in e1:
        e2_title = ei.xpath('h3/a/text()')
        title = ''.join(e2_title)

        e2_author = ei.xpath('div[1]/span[1]/text()')
        author = ''.join(e2_author)
        e2_JnName = ei.xpath('div[1]/span[3]/text()')
        JnName = ''.join(e2_JnName)
        e2_JnVol = ei.xpath('div[1]/span[4]/text()')
        JnVol = ''.join(e2_JnVol)
        e2_JnType = ei.xpath('div[1]/span[5]/text()')
        JnType = ''.join(e2_JnType)

        e2_href = ei.xpath('h3/a/@href')
        href = ''.join(e2_href)
        if URL not in href:
            href = URL + href

        Jn_content = get_html(href, '')
        Jn_tree = html.fromstring(Jn_content.text)

        # 这是之前的代码
        # e2_abstract = Jn_tree.xpath('//div[@class="mdui-col-md-11 mdui-col-xs-9 mdui-text-color-black-text"]/p/text()')
        # 修改后的代码
        e2_abstract = Jn_tree.xpath('//div[@class="mdui-col-xs-12 mdui-text-color-black-text mdui-typo"]/p/text()')

        abstract = (''.join(e2_abstract)).strip()

        print('********************' + href)

        print('title: >>>>>>>>>>> %s' % title)
        print('href: ----------- %s' % href)
        print('author: >>>>>>>>>>> %s' % author)
        print('JnName: ---- %s ---- %s ---- %s' % (JnName, JnVol, JnType))
        print('>>>>>>>>>>> Abstract <<<<<<<<<<<')
        print('%s' % abstract)

        # 保存数据记录
        cur.execute('insert into lunwen (title, Abstract) VALUES (%s, %s)',
                   [ title, abstract])

        Num_Paper += 1

    page_ii += 1

    nextpg = tree.xpath('//div[@class="mdui-col-xs-9 TitleLeftCell mdui-valign"]/a[last()]')
    if nextpg.__len__() == 0:
        break
    nextpg_text = nextpg[0].text
    if nextpg_text == '下一页':
        page_url = nextpg[0].attrib['href']
    else:
        break

    if URL not in page_url:
        page_url = URL + page_url

print('****************************************************************************')
print('>>>>>>>>>>>>>>>>  数据导出结束,共导出 %d 篇文献!<<<<<<<<<<<<<<<<<<<<<<<<<<' % Num_Paper)

# 关闭数据库
conn.commit()
cur.close()

参考资料

[1]. ERROR 1366 (HY000):Incorrect string value解决方案(亲测)

[2]. CNKI网页爬虫