最新要闻

广告

手机

iphone11大小尺寸是多少?苹果iPhone11和iPhone13的区别是什么?

iphone11大小尺寸是多少?苹果iPhone11和iPhone13的区别是什么?

警方通报辅警执法直播中被撞飞:犯罪嫌疑人已投案

警方通报辅警执法直播中被撞飞:犯罪嫌疑人已投案

家电

数据挖掘第四周作业

来源:博客园


(相关资料图)

第一部分——商品零售购物篮分析

代码一:查看数据特征

import numpy as np
import pandas as pd

# Cell 1: load the order file and print basic facts about the "id" column.
# Raw string: the Windows path contains backslashes (`\p`, `\G`) that are
# invalid escape sequences in an ordinary string literal.
inputfile = r"D:\python_data\GoodsOrder.csv"  # input data file
data = pd.read_csv(inputfile, encoding="gbk")  # read the data
data.info()  # print column names, non-null counts and dtypes

data = data["id"]
# Count, minimum and maximum of the ids, in that order.
description = [data.count(), data.min(), data.max()]
description = pd.DataFrame(description, index=["Count", "Min", "Max"]).T
print("描述性统计结果:\n", np.round(description))  # print the summary
# Printed output begins: RangeIndex: 43367 entries, 0 to 43366
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      43367 non-null  int64
 1   Goods   43367 non-null  object
dtypes: int64(1), object(1)
memory usage: 677.7+ KB
描述性统计结果:
    Count  Min   Max
0  43367    1  9835

代码二:分析热销商品
# Cell 2: rank products by how many order rows mention them.
# Raw string keeps the backslashes of the Windows path literal (`\p` and `\G`
# are invalid escape sequences in an ordinary string).
inputfile = r"D:\python_data\GoodsOrder.csv"
data = pd.read_csv(inputfile, encoding="gbk")
# One row per product; the "id" column now holds the number of order rows.
group = data.groupby(["Goods"]).count().reset_index()
# NOTE(review): `sorted` shadows the builtin of the same name. The name is
# kept because the plotting code that follows reads this variable.
sorted = group.sort_values("id", ascending=False)
print("销量排行前10商品的销量:\n", sorted[:10])
销量排行前10商品的销量:
      Goods    id
7     全脂牛奶  2513
8     其他蔬菜  1903
155    面包卷  1809
134     苏打  1715
150     酸奶  1372
99     瓶装水  1087
70   根茎类蔬菜  1072
85    热带水果  1032
143    购物袋   969
160     香肠   924
import matplotlib.pyplot as plt

# Horizontal bar chart of the ten best-selling products, followed by each
# product's share of all order rows.
top10 = sorted[:10]  # `sorted` is the sales ranking DataFrame, not the builtin
x = top10["Goods"]
y = top10["id"]

# Choose the font before any text is created so every label uses it.
plt.rcParams["font.sans-serif"] = "SimHei"
plt.figure(figsize=(8, 4))
plt.barh(x, y)
plt.xlabel("销量")
plt.ylabel("商品类别")
plt.title("商品的销量TOP10 3129")
plt.show()

data_nums = data.shape[0]  # total number of order rows
# The row index is not needed, so it is discarded as `_`.
for _, row in top10.iterrows():
    print(row["Goods"], row["id"], row["id"] / data_nums)
全脂牛奶 2513 0.05794728710770863
其他蔬菜 1903 0.0438812922268084
面包卷 1809 0.04171374547466968
苏打 1715 0.039546198722530956
酸奶 1372 0.031636958978024765
瓶装水 1087 0.025065141697604168
根茎类蔬菜 1072 0.024719256577582033
热带水果 1032 0.023796896257523
购物袋 969 0.022344178753430026
香肠 924 0.021306523393363617

代码三:各类别商品的销量及其占比
# Cell 3: sales volume and share of every product category.
# Raw strings keep the backslashes of the Windows paths literal.
inputfile1 = r"D:\python_data\GoodsOrder.csv"
inputfile2 = r"D:\python_data\GoodsTypes.csv"
data = pd.read_csv(inputfile1, encoding="gbk")
types = pd.read_csv(inputfile2, encoding="gbk")  # read the category table

# Order rows per product, largest first, with a fresh 0..n-1 index.
group = data.groupby(["Goods"]).count().reset_index()
sort = group.sort_values("id", ascending=False).reset_index(drop=True)
data_nums = data.shape[0]  # total number of order rows

# Merge the two DataFrames on the column(s) they have in common.
sort_links = pd.merge(sort, types)

# Total per category, largest first. Only the numeric "id" column is summed:
# a plain .sum() would also "sum" (concatenate) the product-name strings on
# pandas versions that no longer drop non-numeric columns automatically.
sort_link = sort_links.groupby(["Types"])[["id"]].sum().reset_index()
sort_link = sort_link.sort_values("id", ascending=False).reset_index(drop=True)

# Share of each category in all order rows, then print and save.
sort_link["percent"] = sort_link["id"] / data_nums
print("各类别商品的销量及其占比:\n", sort_link)
outfile1 = r"D:\python_data\percent.csv"
sort_link.to_csv(outfile1, index=False, header=True, encoding="gbk")  # save the result

# Pie chart of the share of each category.
import matplotlib.pyplot as plt

data = sort_link["percent"]
labels = sort_link["Types"]
# Choose the font before any text is created so every label uses it.
plt.rcParams["font.sans-serif"] = "SimHei"
plt.figure(figsize=(8, 6))  # canvas size
plt.pie(data, labels=labels, autopct="%1.2f%%")
plt.title("每类商品销量占比3129", fontdict={"size": 20})  # chart title
plt.show()

各类别商品的销量及其占比:
   Types    id   percent
0  非酒精饮料  7594  0.175110
1  西点  7192  0.165840
2  果蔬  7146  0.164780
3  米粮调料  5185  0.119561
4  百货  5141  0.118546
5  肉类  4870  0.112297
6  酒精饮料  2287  0.052736
7  食品类  1870  0.043120
8  零食  1459  0.033643
9  熟食  541  0.012475

代码四:非酒精饮料内部商品的销量及其占比
# Cell 4: products inside the "非酒精饮料" (non-alcoholic drinks) category,
# with each product's share of the category total.
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of sort_links (which triggers pandas' chained-assignment warning).
selected = sort_links.loc[sort_links["Types"] == "非酒精饮料"].copy()
child_nums = selected["id"].sum()  # total order rows in this category
selected["child_percent"] = selected["id"] / child_nums  # share per product
selected.rename(columns={"id": "count"}, inplace=True)
print("非酒精饮料内部商品的销量及其占比:\n", selected)

# The original wrote `sort_link` (the per-category table) to percent.csv a
# second time, so this cell's result was never saved. Write `selected`, and use
# a separate file so the per-category percent.csv is not overwritten.
# NOTE(review): the file name child_percent.csv is chosen here — confirm.
outfile2 = r"D:\python_data\child_percent.csv"
selected.to_csv(outfile2, index=False, header=True, encoding="gbk")  # save the result

0    全脂牛奶  2513  非酒精饮料  0.330919
3    苏打  1715  非酒精饮料  0.225836
5    瓶装水  1087  非酒精饮料  0.143139
16   水果/蔬菜汁  711  非酒精饮料  0.093627
22   咖啡  571  非酒精饮料  0.075191
38   超高温杀菌的牛奶  329  非酒精饮料  0.043324
45   其他饮料  279  非酒精饮料  0.036740
51   一般饮料  256  非酒精饮料  0.033711
101  速溶咖啡  73  非酒精饮料  0.009613
125  茶  38  非酒精饮料  0.005004
144  可可饮料  22  非酒精饮料  0.002897

# Pie chart of each product's share inside the non-alcoholic drinks category.
import matplotlib.pyplot as plt

data = selected["child_percent"]
labels = selected["Goods"]
# Choose the font before any text is created so every label uses it.
plt.rcParams["font.sans-serif"] = "SimHei"
plt.figure(figsize=(8, 6))  # canvas size

# Radial offset of each wedge, tuned by hand for the 11 products of the
# original data set.
explode = (0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.08, 0.3, 0.1, 0.3)
# plt.pie needs exactly one offset per wedge; pad with 0 or truncate so a
# different number of products does not raise. Unchanged when lengths match.
explode = (explode + (0.0,) * len(data))[:len(data)]

plt.pie(data, explode=explode, labels=labels, autopct="%1.2f%%",
        pctdistance=1.1, labeldistance=1.2)
plt.title("非酒精饮料内部各商品的销量占比3129", fontdict={"size": 20})  # chart title
plt.axis("equal")  # equal axes so the pie is drawn as a circle
plt.show()  # display the figure
代码五:数据转换
# Cell 5: turn the (id, Goods) rows into one list of product names per id.
# Pass the path to pandas instead of an open() handle: the handle was never
# closed, and because it was already opened in text mode with the platform's
# default encoding, the encoding="gbk" argument could not take effect.
inputfile = r"D:\python_data\GoodsOrder.csv"
data = pd.read_csv(inputfile, encoding="gbk")

# Collect the products of every id into a list, ordered by id. This replaces
# the original "prefix a comma, concatenate, strip, split on comma" round trip;
# the result is the same unless a product name itself contains a comma, in
# which case the name is now kept whole.
data_translation = data.groupby("id")["Goods"].apply(list).tolist()
print("数据转换结果的前5个元素:\n", data_translation[0:5])

数据转换结果的前5个元素: [["柑橘类水果", "人造黄油", "即食汤", "半成品面包"], ["咖啡", "热带水果", "酸奶"], ["全脂牛奶"], ["奶油乳酪", "肉泥", "仁果类水果", "酸奶"], ["炼乳", "长面包", "其他蔬菜", "全脂牛奶"]]

代码六:构建关联规则模型

# NOTE(review): star import kept from the original. It replaces builtins such
# as all/any/sum/min/max/round with numpy functions, so the code below avoids
# calling all()/any() on generators.
from numpy import *


def loadDataSet():
    """Return a small hard-coded list of transactions for experiments."""
    return [["a", "c", "e"], ["b", "d"], ["b", "c"], ["a", "b", "c", "d"], ["a", "b"], ["b", "c"], ["a", "b"],
            ["a", "b", "c", "e"], ["a", "b", "c"], ["a", "c", "e"]]


def createC1(dataSet):
    """Return the candidate 1-itemsets of *dataSet*.

    Every distinct item becomes a one-element frozenset; the list is ordered
    by item. frozensets are hashable, so they can serve as dictionary keys.
    """
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    return [frozenset([item]) for item in sorted(items)]


def _count_support(D, Ck, min_support):
    """Count how often each candidate of *Ck* occurs in the transactions *D*.

    Returns ``(frequent, support)``: the candidates whose support is at least
    *min_support*, in the order they were first matched, and a dict mapping
    each of them to its support (matches / number of transactions).
    """
    # Drop repeated candidates (keeping order) so none is counted twice.
    candidates = list(dict.fromkeys(Ck))
    counts = {}
    for transaction in D:
        for candidate in candidates:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1
    total = float(len(D))
    frequent = []
    support = {}
    for candidate, count in counts.items():
        ratio = count / total
        if ratio >= min_support:
            frequent.append(candidate)
            support[candidate] = ratio
    return frequent, support


# From candidate k-itemsets to frequent k-itemsets (support counting).
def scanD(D, Ck, minSupport):
    """Return the frequent itemsets of *Ck* and their supports.

    The list is in reverse order of first match (the original prepended each
    hit); the dict maps every returned itemset to its support.
    """
    frequent, supportData = _count_support(D, Ck, minSupport)
    frequent.reverse()
    return frequent, supportData


def calSupport(D, Ck, min_support):
    """Same as :func:`scanD`, but the list is in order of first match."""
    # Only frequent itemsets get an entry in the returned support dict.
    return _count_support(D, Ck, min_support)


def _all_subsets_frequent(candidate, frequent):
    """Return True if every subset of *candidate* missing one item is in *frequent*."""
    # Explicit loop: the builtin all() is shadowed by numpy's all() here.
    for item in candidate:
        if candidate - {item} not in frequent:
            return False
    return True


# Candidate generation with pruning.
def aprioriGen(Lk, k):
    """Build the candidate k-itemsets from the frequent (k-1)-itemsets *Lk*.

    Two itemsets are joined when their first k-2 items agree; the joined set
    is kept only if all of its (k-1)-subsets are themselves in *Lk*.
    """
    itemsets = [frozenset(itemset) for itemset in Lk]
    frequent = set(itemsets)  # O(1) membership tests while pruning
    # Sort BEFORE taking the prefix. The original sliced the unordered set
    # first, so the compared prefix depended on hash order, and candidates
    # could be missed or generated more than once.
    ordered = [sorted(itemset) for itemset in itemsets]
    retList = []
    emitted = set()
    for i in range(len(itemsets)):
        for j in range(i + 1, len(itemsets)):
            if ordered[i][:k - 2] != ordered[j][:k - 2]:
                continue
            candidate = itemsets[i] | itemsets[j]
            # Skip degenerate joins (e.g. duplicate inputs) and repeats.
            if len(candidate) != k or candidate in emitted:
                continue
            if _all_subsets_frequent(candidate, frequent):
                emitted.add(candidate)
                retList.append(candidate)
    return retList


def apriori(dataSet, minSupport=0.2):
    """Find all frequent itemsets of *dataSet*.

    Returns ``(L, supportData)``: ``L[i]`` is the list of frequent
    (i+1)-itemsets and ``supportData`` maps every frequent itemset to its
    support.
    """
    # Frequent 1-itemsets first.
    C1 = createC1(dataSet)
    D = list(map(set, dataSet))
    L1, supportData = calSupport(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:  # stop once a level yields nothing
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)  # scan the transactions to get Lk
        supportData.update(supK)
        L.append(Lk)
        k += 1
    del L[-1]  # the last level is always empty
    return L, supportData


# Generate the subsets of a collection.
def getSubset(fromList, toList):
    """Append to *toList* every subset reachable from *fromList* by removing items.

    Subsets are added as frozensets, each only once; recursion continues
    while a subset still has more than one item.
    """
    for item in fromList:
        subset = frozenset(set(fromList) - set([item]))
        if subset not in toList:
            toList.append(subset)
            remaining = list(subset)
            if len(remaining) > 1:
                getSubset(remaining, toList)


def calcConf(freqSet, H, supportData, ruleList, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) --> conseq`` for each conseq in *H*.

    A rule is printed and appended to *ruleList* as
    ``(antecedent, consequent, confidence)`` when its confidence is at least
    *minConf* and its lift is greater than 1.
    """
    for conseq in H:
        antecedent = freqSet - conseq
        # confidence = support(A and B) / support(A)
        conf = supportData[freqSet] / supportData[antecedent]
        # lift = support(A and B) / (support(A) * support(B))
        lift = supportData[freqSet] / (supportData[conseq] * supportData[antecedent])
        if conf >= minConf and lift > 1:
            print(antecedent, "-->", conseq, "支持度", round(supportData[freqSet], 6), "置信度:", round(conf, 6),
                  "lift值为:", round(lift, 6))
            ruleList.append((antecedent, conseq, conf))


# Generate the rules.
def gen_rule(L, supportData, minConf=0.7):
    """Return all rules derived from the frequent itemsets in *L*.

    *L* and *supportData* are the two values returned by :func:`apriori`.
    """
    bigRuleList = []
    for i in range(1, len(L)):  # start with the 2-itemsets
        for freqSet in L[i]:
            # Subsets of the itemset, from size k-1 down to single items.
            all_subset = []
            getSubset(list(freqSet), all_subset)
            calcConf(freqSet, all_subset, supportData, bigRuleList, minConf)
    return bigRuleList


if __name__ == "__main__":
    dataSet = data_translation  # per-id product lists built by the data-conversion step
    L, supportData = apriori(dataSet, minSupport=0.02)
    rule = gen_rule(L, supportData, minConf=0.35)

frozenset({"水果/蔬菜汁"}) --> frozenset({"全脂牛奶"}) 支持度 0.02664 置信度: 0.368495 lift值为: 1.44216
frozenset({"人造黄油"}) --> frozenset({"全脂牛奶"}) 支持度 0.024199 置信度: 0.413194 lift值为: 1.617098
frozenset({"仁果类水果"}) --> frozenset({"全脂牛奶"}) 支持度 0.030097 置信度: 0.397849 lift值为: 1.557043
frozenset({"牛肉"}) --> frozenset({"全脂牛奶"}) 支持度 0.021251 置信度: 0.405039 lift值为: 1.58518
frozenset({"冷冻蔬菜"}) --> frozenset({"全脂牛奶"}) 支持度 0.020437 置信度: 0.424947 lift值为: 1.663094
frozenset({"本地蛋类"}) --> frozenset({"其他蔬菜"}) 支持度 0.022267 置信度: 0.350962 lift值为: 1.813824
frozenset({"黄油"}) --> frozenset({"其他蔬菜"}) 支持度 0.020031 置信度: 0.361468 lift值为: 1.868122
frozenset({"本地蛋类"}) --> frozenset({"全脂牛奶"}) 支持度 0.029995 置信度: 0.472756 lift值为: 1.850203
frozenset({"黑面包"}) --> frozenset({"全脂牛奶"}) 支持度 0.025216 置信度: 0.388715 lift值为: 1.521293
frozenset({"糕点"}) --> frozenset({"全脂牛奶"}) 支持度 0.033249 置信度: 0.373714 lift值为: 1.462587
frozenset({"酸奶油"}) --> frozenset({"其他蔬菜"}) 支持度 0.028876 置信度: 0.402837 lift值为: 2.081924
frozenset({"猪肉"}) --> frozenset({"其他蔬菜"}) 支持度 0.021657 置信度: 0.375661 lift值为: 1.941476
frozenset({"酸奶油"}) --> frozenset({"全脂牛奶"}) 支持度 0.032232 置信度: 0.449645 lift值为: 1.759754
frozenset({"猪肉"}) --> frozenset({"全脂牛奶"}) 支持度 0.022166 置信度: 0.38448 lift值为: 1.504719
frozenset({"根茎类蔬菜"}) --> frozenset({"全脂牛奶"}) 支持度 0.048907 置信度: 0.448694 lift值为: 1.756031
frozenset({"根茎类蔬菜"}) --> frozenset({"其他蔬菜"}) 支持度 0.047382 置信度: 0.434701 lift值为: 2.246605
frozenset({"凝乳"}) --> frozenset({"全脂牛奶"}) 支持度 0.026131 置信度: 0.490458 lift值为: 1.919481
frozenset({"热带水果"}) --> frozenset({"全脂牛奶"}) 支持度 0.042298 置信度: 0.403101 lift值为: 1.577595
frozenset({"柑橘类水果"}) --> frozenset({"全脂牛奶"}) 支持度 0.030503 置信度: 0.36855 lift值为: 1.442377
frozenset({"黄油"}) --> frozenset({"全脂牛奶"}) 支持度 0.027555 置信度: 0.497248 lift值为: 1.946053
frozenset({"酸奶"}) --> frozenset({"全脂牛奶"}) 支持度 0.056024 置信度: 0.401603 lift值为: 1.571735
frozenset({"其他蔬菜"}) --> frozenset({"全脂牛奶"}) 支持度 0.074835 置信度: 0.386758 lift值为: 1.513634
frozenset({"酸奶", "其他蔬菜"}) --> frozenset({"全脂牛奶"}) 支持度 0.022267 置信度: 0.512881 lift值为: 2.007235
frozenset({"酸奶", "全脂牛奶"}) --> frozenset({"其他蔬菜"}) 支持度 0.022267 置信度: 0.397459 lift值为: 2.054131
frozenset({"其他蔬菜", "根茎类蔬菜"}) --> frozenset({"全脂牛奶"}) 支持度 0.023183 置信度: 0.48927 lift值为: 1.914833
frozenset({"全脂牛奶", "根茎类蔬菜"}) --> frozenset({"其他蔬菜"}) 支持度 0.023183 置信度: 0.474012 lift值为: 2.44977

代码七:西点内部商品的销量及其占比

import seaborn as sns

# Cell 7: products inside the "西点" (pastry) category — bar chart of sales,
# per-product share of the category, CSV export and pie chart.
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of sort_links (which triggers pandas' chained-assignment warning).
selected = sort_links.loc[sort_links["Types"] == "西点"].copy()

# Choose the font before any text is created so every label uses it.
plt.rcParams["font.sans-serif"] = "SimHei"

# Bar chart of the sales of each product in the category.
plt.figure(figsize=(10, 5))
sns.barplot(x=list(selected["id"]), y=list(selected["Goods"]))
plt.xlabel("商品销量")
plt.ylabel("商品类别")
plt.title("西点类别中不同商品的销量3129")
plt.show()

# Share of each product within the category, then print and save.
child_nums = selected["id"].sum()  # total order rows in this category
selected["child_percent_xidian"] = selected["id"] / child_nums
selected.rename(columns={"id": "count"}, inplace=True)
print("西点内部商品的销量及其占比:\n", selected)
outfile3 = r"D:\python_data\child_percent_xidian.csv"
# The original exported `sort_link` (the per-category table) here, so this
# cell's result was never saved; export `selected` instead.
selected.to_csv(outfile3, index=False, header=True, encoding="gbk")  # save the result

# Pie chart of each product's share inside the category.
data = selected["child_percent_xidian"]
labels = selected["Goods"]
plt.figure(figsize=(8, 6))  # canvas size
# Radial offset of each wedge, tuned by hand for the 21 products of the
# original data set.
explode = (0.05, 0.04, 0.04, 0.05, 0.06, 0.07, 0.03, 0.03, 0.03, 0.02, 0.03, 0.02, 0.02, 0.02, 0.02, 0.08,
           0.3, 0.34, 0.38, 0.4, 0.8)
# plt.pie needs exactly one offset per wedge; pad with 0 or truncate so a
# different number of products does not raise. Unchanged when lengths match.
explode = (explode + (0.0,) * len(data))[:len(data)]
plt.pie(data, explode=explode, labels=labels, autopct="%1.2f%%",
        pctdistance=1.1, labeldistance=1.2)
plt.title("西点内部各商品的销量占比3129", fontdict={"size": 20})  # chart title
plt.axis("equal")  # equal axes so the pie is drawn as a circle
plt.show()  # display the figure

2    面包卷  1809  西点  0.251529
10   糕点  875  西点  0.121663
18   黑面包  638  西点  0.088710
31   白面包  414  西点  0.057564
32   奶油乳酪  390  西点  0.054227
33   威化饼  378  西点  0.052558
34   咸点心  372  西点  0.051724
35   长面包  368  西点  0.051168
36   甜点  365  西点  0.050751
48   酪  275  西点  0.038237
54   切片奶酪  241  西点  0.033509
55   硬奶酪  241  西点  0.033509
64   半成品面包  174  西点  0.024194
68   软奶酪  168  西点  0.023359
74   风味蛋糕  130  西点  0.018076
92   甜食  89  西点  0.012375
94   特色奶酪  84  西点  0.011680
103  面包干  68  西点  0.009455
116  干面包  50  西点  0.006952
117  凝乳酪  50  西点  0.006952
152  奶油  13  西点  0.001808

关键词: