找回密码
 立即注册

扫一扫,访问微社区

QQ登录

只需一步,快速开始

查看: 1272|回复: 1

[求助] 【ji】求教这个脚本是什么意思?尤其是intersection、union部分

1

主题

1

帖子

1

积分

贫民

积分
1
Yali 发表于 2020-10-28 15:56:49 | 显示全部楼层 |阅读模式
import pandas as pd
import itertools
import timeit
import re

# start the wall-clock timer; later stage timings are measured relative to it
start = timeit.default_timer()

# load each shop's CSV and turn the DataFrame into a plain list of row lists
shop1_csv = pd.read_csv('shop1.csv')
shop1_list = shop1_csv.values.tolist()
shop2_csv = pd.read_csv('shop2.csv')
shop2_list = shop2_csv.values.tolist()

# function to match items from two shops
def _jaccard(words1, words2):
    """Return the Jaccard index of two word collections, rounded to 2 places.

    The index is ``len(intersection) / len(union)`` of the distinct words.
    When both collections are empty the union is empty too; 0.0 is returned
    instead of dividing by zero.
    """
    set1, set2 = set(words1), set(words2)
    union = set1 | set2
    if not union:
        return 0.0
    return round(len(set1 & set2) / len(union), 2)


def matching(shop1, shop2):
    """Append every sufficiently similar ``shop2`` item to its ``shop1`` row.

    For each pair (item1, item2) the titles (column 0 of each row) are split
    into word tokens and compared with a Jaccard score. Bracketed text is
    stripped from the ``shop2`` title before tokenising. When both titles
    have at least 5 tokens the score is ``0.8 * whole-title score +
    0.2 * score of the first six tokens``; otherwise it is the whole-title
    score alone.

    For every pair scoring above 0.499, seven values are appended to item1
    in place: ``item2[0]``, the score, ``item2[1]``, ``item2[2]``,
    ``item2[3]``, an empty string, and a price verdict. The verdict is
    ``item2[3] - item1[5]`` when ``item1[5]`` is the smaller number,
    ``"Cheaper"`` when it is not, and ``"Oops, Something Wrong"`` when
    either value is not an int/float.

    Args:
        shop1: list of row lists; mutated in place. Rows need a title at
            index 0 and the value to compare at index 5.
        shop2: list of row lists with at least four columns; read only.

    Returns:
        None. Progress is printed once per ``shop1`` row.
    """
    threshold = 0.499
    word_re = re.compile(r'\w+')
    square_re = re.compile(r'\[.*?\]')
    # NOTE(review): this pattern was garbled in the source ('\.*?\', which is
    # not even a terminated string). It is reconstructed here as "text inside
    # round brackets", mirroring the square-bracket pattern above -- confirm.
    round_re = re.compile(r'\(.*?\)')

    # The cleaned shop2 tokens depend only on item2, so build them once
    # instead of once per (item1, item2) pair.
    shop2_titles = []
    for item2 in shop2:
        title2_string = str(item2[0])
        # Both searches run on the untouched title before anything is removed.
        bracketed = square_re.findall(title2_string) + round_re.findall(title2_string)
        for bracket in bracketed:
            title2_string = title2_string.replace(bracket, "")
        shop2_titles.append(word_re.findall(title2_string))

    total = len(shop1)
    for i, item1 in enumerate(shop1):
        # item1[0] never changes below (values are only appended), so the
        # tokenisation can be hoisted out of the inner loop.
        title1 = word_re.findall(str(item1[0]))

        for item2, title2 in zip(shop2, shop2_titles):
            # calculate jaccard score
            score = _jaccard(title1, title2)
            if len(title1) >= 5 and len(title2) >= 5:
                # Long titles: blend in the similarity of the leading words.
                score = (0.8 * score) + (0.2 * _jaccard(title1[0:6], title2[0:6]))

            if score <= threshold:
                continue

            item1.extend([item2[0], score, item2[1], item2[2], item2[3], ""])
            if isinstance(item1[5], (int, float)) and isinstance(item2[3], (int, float)):
                if item1[5] < item2[3]:
                    us_higher = item2[3] - item1[5]
                else:
                    us_higher = "Cheaper"
            else:
                us_higher = "Oops, Something Wrong"
            item1.append(us_higher)

        # print progress
        print('matching: ' + str(i + 1) + '/' + str(total))

# sort according to percent
def compare(shop1):
    """Reorder the appended matches of every row by descending score, in place.

    Each row is read as consecutive groups of 7 values. The first group is
    kept in front; the remaining groups are sorted by their second value
    (index 1 of the group) from highest to lowest, and the flattened result
    is written back to ``shop1``. Groups with equal scores keep their
    original relative order.

    The original assigned the result to the local name ``shop1`` instead of
    ``shop1[i]``, so the caller's list was never modified.

    Args:
        shop1: list of row lists; rows are replaced in place. Empty rows are
            left untouched.

    Returns:
        None.
    """
    group_size = 7
    for i, item1 in enumerate(shop1):
        groups = [item1[pos:pos + group_size] for pos in range(0, len(item1), group_size)]
        if not groups:
            # Nothing to sort, and groups[0] would raise IndexError.
            continue
        ordered = [groups[0]] + sorted(groups[1:], key=lambda x: x[1], reverse=True)
        shop1[i] = list(itertools.chain.from_iterable(ordered))

# restrict the length of each row
def cut(shop1, max_len=77):
    """Truncate every row of ``shop1`` to at most ``max_len`` values, in place.

    Args:
        shop1: list of sliceable rows; each entry is replaced by its first
            ``max_len`` values. Shorter rows keep all their values.
        max_len: maximum number of values kept per row. The default of 77
            equals the number of export headers built elsewhere in this
            script (7 fixed headers plus 10 groups of 7).

    Returns:
        None.
    """
    for j, row in enumerate(shop1):
        shop1[j] = row[:max_len]

# run the three stages one after another, timing each stage separately
matching(shop1_list, shop2_list)
stop1 = timeit.default_timer()
time_matching = stop1 - start
compare(shop1_list)
stop2 = timeit.default_timer()
time_compare = stop2 - stop1
cut(shop1_list)
stop3 = timeit.default_timer()
time_cut = stop3 - stop2

# export result to csv file
# 7 fixed columns, followed by 10 numbered groups of 7 columns each
headers = [
    'Competitor Title',
    'Competitor Url',
    'Competitor Price',
    'Competitor Shipping Fee (default)',
    'Competitor Shipping Fee (e-packet)',
    'Competitor Price Total (default)',
    'Competitor Sold',
]
for k in range(1, 11):
    headers.extend([
        'Shopee Title {}'.format(k),
        'Match Score {}'.format(k),
        'Shopee Url {}'.format(k),
        'Shopee Price {}'.format(k),
        'Shopee Price Total {}'.format(k),
        'Check {}'.format(k),
        'Us_Higher {}'.format(k),
    ])
# one all-blank row exactly as wide as the header list, appended as the last
# row -- presumably so the frame always has as many columns as headers
empty = [""] * len(headers)
shop1_list.append(empty)
matched_df = pd.DataFrame(shop1_list, columns=headers)
export_csv = matched_df.to_csv(
    r'C:/Users/esther.gao/PycharmProjects/Price Comparison/测试-Y.csv',
    index=None,
    encoding='utf-8-sig',
)

print('final time: matching time {} compare time {} cut time {}'.format(
    time_matching, time_compare, time_cut))


回复

使用道具 举报

0

主题

956

帖子

956

积分

圣骑士

积分
956
sheeboard 发表于 2020-10-30 15:38:55 | 显示全部楼层
intersection 和 union 是集合操作,分别求两个集合的交集和并集。
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

快速回复 返回顶部 返回列表