Python實(shí)現(xiàn)搜索Google Scholar論文信息的示例代碼
示例數(shù)據(jù)
示例代碼
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from pybtex.database import BibliographyData, Entry
from pybtex.database.input import bibtex
import pandas as pd
import time
import json
import random


def search_doi(doi):
    """Fetch paper metadata for a DOI from the Crossref REST API.

    Returns the Crossref 'message' dict on HTTP 200, otherwise None.
    """
    url = f'https://api.crossref.org/works/{doi}'
    response = requests.get(url)
    result = None
    if response.status_code == 200:
        result = response.json()['message']
    else:
        print('Error occurred')
    return result


def search_cite(atid):
    """Scrape Google Scholar's citation pop-up for a result id (atid).

    Returns a dict mapping citation style name (e.g. 'MLA') to the
    formatted citation string.
    """
    url = (f'https://scholar.google.com/scholar?q=info:{atid}'
           ':scholar.google.com/&output=cite&scirp=8&hl=zh-CN')
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    result = {}
    for item in soup.find_all('tr'):
        cith = item.find('th', class_='gs_cith').getText()
        citr = item.find('div', class_='gs_citr').getText()
        result[cith] = citr
    return result


def change_clash_node(node_name=None):
    """Switch the active Clash proxy node through its local REST API.

    When node_name is None, a node is picked at random from the
    hard-coded list below (used to rotate IPs when Scholar serves a
    CAPTCHA).
    """
    # Clash controller endpoint and secret.
    url = 'http://127.0.0.1:15043/proxies/??國(guó)外流量'
    password = 'ee735f4e-59c6-4d60-a2ad-aabd075badb2'
    local_node_name = [
        '香港1-IEPL-倍率1.0',
        '香港2-IEPL-倍率1.0',
        '香港3-IEPL-倍率1.0',
        '臺(tái)灣1-IEPL-倍率1.0',
        '臺(tái)灣2-IEPL-倍率1.0',
        '臺(tái)灣3-IEPL-倍率1.0',
        '新加坡1-IEPL-倍率1.0',
        '新加坡2-IEPL-倍率1.0',
        '新加坡3-IEPL-倍率1.0',
    ]
    node_name = node_name or random.choice(local_node_name)
    print(f'當(dāng)前選擇節(jié)點(diǎn)名稱: {node_name}')
    headers = {'Authorization': password}
    data = {
        'name': 'Rule',
        'type': 'Selector',
        'now': node_name,
    }
    response = requests.put(url, headers=headers, json=data)
    if response.status_code == 200:
        print('節(jié)點(diǎn)已更改為:', node_name)
    else:
        print('更改節(jié)點(diǎn)時(shí)出錯(cuò):', response.text)


def proxy_requests(url):
    """GET a URL through the local SOCKS5 proxy (Clash on port 7890)."""
    proxies = {
        'http': 'socks5://127.0.0.1:7890',
        'https': 'socks5://127.0.0.1:7890',
    }
    return requests.get(url, proxies=proxies)


def search(title='GNN', start=0):
    """Scrape one Google Scholar result page for `title` at offset `start`.

    Returns a list of dicts with keys title/year/publisher/href,
    -1 when Scholar answers with a CAPTCHA page, or [] when the page
    could not be parsed for any other reason.
    """
    url = (f'https://scholar.google.com/scholar?start={start}'
           f'&q=allintitle:+{title}&hl=zh-CN&as_sdt=0,5')
    resp = proxy_requests(url)
    soup = BeautifulSoup(resp.text, 'lxml')
    container = soup.find(id='gs_res_ccl_mid')
    if container is None:
        print(soup)
        # BUG FIX: the original tested `'captcha-form' in soup` against the
        # BeautifulSoup object; the marker is in the raw HTML text.
        if 'captcha-form' in resp.text:
            return -1
        # BUG FIX: the original fell through with `papers_item` unbound
        # (NameError); return an empty page instead.
        return []
    papers_item = container.find_all('div', class_='gs_scl')
    papers_info = []
    for paper in papers_item:
        link = paper.find('h3', class_='gs_rt').find('a')
        if link is None:
            # Entries such as [CITATION] carry no hyperlink; skip them
            # instead of aborting the whole page with AttributeError.
            continue
        # The PDF/source sidebar is optional; fall back to an empty
        # publisher rather than crashing when it is absent.
        ggsm = paper.find('div', class_='gs_or_ggsm')
        publisher = ggsm.getText().split()[1].split('.')[0] if ggsm else ''
        href = link.get('href')
        paper_title = link.getText()
        detail = paper.find('div', class_='gs_ri').find('div', class_='gs_a').getText()
        # The year is the first 4 chars of the last comma-separated field.
        year = detail.split(',')[-1].strip()[:4]
        papers_info.append({
            'title': paper_title,
            'year': year,
            'publisher': publisher,
            'href': href,
        })
    return papers_info


# --- main scraping loop: pages [index_start, index_end) in steps of index_gap ---
index_start = 0
index_end = 500
index_gap = 10
papers_store = []
bar = tqdm(total=index_end - index_start, desc=f'From {index_start} to {index_end}')
while index_start < index_end:
    try:
        papers_info = search(title='GNN', start=index_start)
        if papers_info == -1:
            # CAPTCHA: rotate the proxy node and retry the same offset.
            print('需要驗(yàn)證碼,更換節(jié)點(diǎn)后2秒內(nèi)重試')
            change_clash_node()
            time.sleep(2)
            continue
        papers_store.extend(papers_info)
    except AttributeError as e:
        print(e)
        break
    index_start += index_gap
    bar.update(index_gap)
    bar.refresh()
    time.sleep(0.1)
bar.close()

# BUG FIX: the original built the DataFrame from `papers_info` (only the
# last fetched page); persist everything accumulated in `papers_store`.
df = pd.DataFrame(papers_store)
print(df)
df.to_csv('data.csv', index=False)
以上就是Python實(shí)現(xiàn)搜索Google Scholar論文信息的示例代碼的詳細(xì)內(nèi)容,更多關(guān)于Python搜索Google Scholar論文信息的資料請(qǐng)關(guān)注腳本之家其它相關(guān)文章!
相關(guān)文章
python之基數(shù)排序的實(shí)現(xiàn)
這篇文章主要介紹了python之基數(shù)排序的實(shí)現(xiàn),本篇文章通過簡(jiǎn)要的案例,講解了該項(xiàng)技術(shù)的了解與使用,以下就是詳細(xì)內(nèi)容,需要的朋友可以參考下(2021-07-07)
python 生成正態(tài)分布數(shù)據(jù),并繪圖和解析
這篇文章主要介紹了python 生成正態(tài)分布數(shù)據(jù),并繪圖和解析,幫助大家更好的利用python進(jìn)行數(shù)據(jù)分析,感興趣的朋友可以了解下(2020-12-12)
Anaconda修改默認(rèn)虛擬環(huán)境安裝位置的方案分享
新安裝Anaconda后,在創(chuàng)建環(huán)境時(shí)環(huán)境自動(dòng)安裝在C盤,但是C盤空間有限,下面這篇文章主要給大家介紹了關(guān)于Anaconda修改默認(rèn)虛擬環(huán)境安裝位置的相關(guān)資料,需要的朋友可以參考下(2023-01-01)
python中(負(fù)數(shù))整除和取模運(yùn)算方式
Python中的取模運(yùn)算符是%,它與其他語(yǔ)言中的取余符號(hào)相同,整除運(yùn)算符是//,表示向下取整。在Python中,正數(shù)的取余和取模結(jié)果相同,但負(fù)數(shù)的取余和取模結(jié)果有所不同:取余運(yùn)算在計(jì)算時(shí)向0方向舍棄小數(shù)位,而取模運(yùn)算向負(fù)無(wú)窮方向舍棄小數(shù)位(2024-10-10)
使用Python實(shí)現(xiàn)ELT統(tǒng)計(jì)多個(gè)服務(wù)器下所有數(shù)據(jù)表信息
這篇文章主要介紹了使用Python實(shí)現(xiàn)ETL統(tǒng)計(jì)多個(gè)服務(wù)器下所有數(shù)據(jù)表信息。ETL是英文Extract-Transform-Load的縮寫,用來描述將數(shù)據(jù)從來源端經(jīng)過抽?。╡xtract)、轉(zhuǎn)換(transform)、加載(load)至目的端的過程,需要的朋友可以參考下(2023-07-07)
python3中替換python2中cmp函數(shù)的實(shí)現(xiàn)
這篇文章主要介紹了python3替換python2中cmp函數(shù),文中通過示例代碼介紹的非常詳細(xì),對(duì)大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧2019-08-08