|
这段程序将美国专利局的资料爬取下来，并储存成 Excel 文件：
import requests
from bs4 import BeautifulSoup
import csv
# Scrape the classification table for every CPC code listed in the first
# column of CPC.csv and save the collected text to an Excel workbook.
import pandas

patent_ary = []  # one {'cpc': ..., 'content': ...} record per scraped code
CPC = ''  # text of the most recently scraped table; reused by later steps

with open('CPC.csv', newline='') as csvFile:
    rows = csv.reader(csvFile)  # iterate the CSV content row by row
    for row in rows:
        # A blank line in the CSV yields an empty row; there is nothing to query.
        if not row or not row[0].strip():
            continue
        print(row[0])
        url = 'http://www.patbase.com/stats/class.php?cpc=' + row[0]  # target page
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table')  # first <table> on the page, or None
        if table is None:
            # Error pages and unknown codes may carry no table; skip them
            # instead of failing on None.getText().
            print('no table found for ' + row[0])
            continue
        CPC = table.getText()
        print(CPC)
        patent_ary.append({'cpc': row[0], 'content': CPC})

pdcol_name = ['cpc', 'content']
pd = pandas.DataFrame(patent_ary, index=None, columns=pdcol_name)
pd.to_excel('search_results1.xlsx')
接着再做断词（分词）处理：
# Split the scraped text in CPC into whitespace-separated tokens and print
# each distinct token that is neither a bare punctuation mark nor an
# English stop word.
from collections import Counter

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # make the stop-word corpus available locally

c = Counter(CPC.split())  # distinct tokens, kept in first-seen order
en_stops = set(stopwords.words('english'))
ps = {'.', ',', ':', ';', '?', '!', ' '}  # punctuation table

# NOTE(review): no file is opened here. An earlier revision opened a local
# claim2.txt without ever reading it; reinstate that only if the file's
# contents are actually needed.

# Matching is exact and case-sensitive: a token such as 'OR' or 'TESTING;'
# is printed because it equals neither a punctuation mark nor a stop word.
for i in c:
    if i not in ps and i not in en_stops:
        print(i)
最后输出结果为：
G:PHYSICS
G01:
INSTRUMENTS;
MEASURING;
TESTING
G06:
COMPUTING;
CALCULATING;
COUNTING
G06T:
IMAGE
DATA
PROCESSING
OR
GENERATION,
IN
GENERAL
G06T7/00:
Image
analysis
G06T7/50:
Depth
shape
recovery
G06T7/521:
laser
ranging,
e.g.
using
interferometry;
projection
structured
light
希望能将最后的输出以阶层方式储存成 Excel 档，例如以下：
G:PHYSICS
G01: INSTRUMENTS; MEASURING; TESTING
G06: COMPUTING; CALCULATING; COUNTING
G06T: IMAGE DATA PROCESSING OR GENERATION, IN GENERAL
G06T7/00: Image analysis
G06T7/50: . Depth or shape recovery
G06T7/521: . . from laser ranging, e.g. using interferometry; from the projection of structured light
|
|