Recommending good-value Taobao products with Python

2016-12-31 17:17

What it does:

  Take a product search term as input, together with requirements on the Taobao review count and the shop's ratings (the 5.0-scale scores for accurate description, service attitude and shipping);

  filter out the requested number of results that satisfy those requirements, and output them sorted by a "good value for money" scoring algorithm.

Approach:

1. Use Taobao search ('https://s.taobao.com/search?') with its price filter to narrow the results by price first, which gives the result-page URL (a sketch of the URL construction follows this list).

2. Open the result page with urllib and match out each product's price, review count, the shop's "described as advertised" score and the other fields with a regular expression;

  save the results into a two-dimensional list.

3. Score every product with the "good value for money" algorithm, using the product and shop information.

4. Sort by that score and write the combined results, in sorted order, to a newly created txt file;

  also download each product's image into the folder and create a txt file whose name starts with the same rank number (and carries the key product details), so the images and the product information can be browsed side by side in one folder using the large-icon view.

5. The input parameters (price range and the other requirements) can be passed in as function arguments, and the whole .py program can then be packaged into an EXE with pyinstaller for distribution.
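For step 1, the price filter is just a query-string parameter on the search URL. A minimal sketch of the construction that get_all_praised_goods() uses further down (%5B, %2C and %5D are the URL-encoded [ , ] characters; each result page is then fetched at initial_url plus 44 times the page number):

reserch_goods = 'ssd120g'
price_min, price_max = 22, 400
initial_url = ('https://s.taobao.com/search?q=' + reserch_goods
               + '&filter=reserve_price%5B' + '%s' % price_min
               + '%2C' + '%s' % price_max + '%5D&s=')
page_url = initial_url + '%s' % (44 * 0)   # first page; later pages use 44*1, 44*2, ...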

 

For example, with the requirements set as:

reserch_goods='ssd120g' # Taobao search term
keyword='.' #raw_input().decode("gbk").encode("utf-8") # extra keyword the product title must contain, to keep out the loosely related items Taobao mixes in (a regular expression; '.' matches anything, i.e. no restriction)
price_min=22 # price range
price_max=400
descripHrequ=0 # %, how far above average the "described as advertised" score must be; only results above this value are kept
servHrequ=0 # %, the same threshold for the service score
descripNrequ=6 # minimum number of reviews
counts=10 # how many products to select
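For reference, these are exactly the values that input_para_inner() in the code below hands to the main routine (savePath is created inside that function):

get_all_praised_goods(reserch_goods, counts, savePath,
                      price_min, price_max,
                      descripHrequ, servHrequ, descripNrequ)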

 

The results are written to the results folder:

[Screenshots of the generated results folder omitted]

The source code is as follows:

# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import time 
import random
import os
from math import log
from math import log10
from math import sqrt
import sys

reload(sys)  
sys.setdefaultencoding('utf8') 
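# Python 2 only: reload(sys) + setdefaultencoding('utf8') is the well-known hack
# that silences implicit str/unicode conversion errors throughout the script.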

class counter(object):
    # counter: shared state for the crawl (totals, seen URLs, collected results)
    def __init__(self):
        self.count  = 0
        self.try_time = 0
        self.fail_time = 0
        self.url_list = []
        self.new_flag = True
        self.results=[]
        self.p=0
        self.d=0

    def print_counter(self):
        print 'try_time:', self.try_time,   "  get_count:" , self.count,   "  fail_time:",self.fail_time

counter1 = counter()


def post_request(url):

    # route requests through an HTTP proxy
    proxy = {'http':'27.24.158.155:84'}
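    # NOTE: this is a hard-coded public proxy address and will almost certainly
    # be stale; if requests fail, drop the ProxyHandler and call
    # urllib2.build_opener() with no arguments to fall back to the default handlers.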
    proxy_support = urllib2.ProxyHandler(proxy)
    # opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler(debuglevel=1))
    opener = urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)

    # build the request with a randomly chosen User-Agent header
    User_Agents=["Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", #
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O"
    ]
    random_User_Agent = random.choice(User_Agents)
    #print random_User_Agent

    req =urllib2.Request(url) #!!

    req.add_header("User-Agent",random_User_Agent)
    req.add_header("GET",url)
    req.add_header("Referer",url)
    return req
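# post_request() only builds the urllib2.Request object; get_praised_good() below
# opens it, roughly like this (shown here only as a usage note):
#   html = urllib2.urlopen(post_request(url))
#   code = html.read()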


def recommend_rate(price,description,delivery,service,comments):
    # 'description' is passed in as the absolute score, not the percent above average
    av_p=counter1.p/counter1.count
    av_d=counter1.d/counter1.count
    rate=(description/av_d)**20*(description+delivery+service)*(av_p/(price))**0.1+log((comments+5),1000)
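    # How the score is built (comments only, no behaviour change):
    #   (description/av_d)**20         - strongly rewards shops whose description
    #                                    score beats the running average
    #   (description+delivery+service) - sum of the three shop scores
    #   (av_p/price)**0.1              - mild bonus for being cheaper than average
    #   log(comments+5, 1000)          - small bonus for a large review count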
    print 'all count=',counter1.count
    print "average price=",av_p,';',av_p/(price),';price',price,';comments=',comments,';descrip=',description
    print 'rate=',rate,'price_factor',(av_p/(price))**0.1,'descrip_factor',(description/av_d)**20,'comments_factor=',log((comments+5),1000)
    return rate


def product_rank(list):
    for x in list:
        #indices from 0: x0 product name, x1 image URL, x2 product URL, x3 price, x4 review count, x5 shop name, x6 delivery score, x7 description score, x8 service score
        rate=recommend_rate(x[3],x[7],x[6],x[8],x[4])
        x.append(rate)
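        # the appended rate becomes x[9]; counter1.results is later sorted on it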


def get_user_rate(item_url):
    # this feature is not used for now
    '''Fetch the seller's credit-rating page; it cannot be accessed without logging in, or a cookie would have to be added to the request headers.'''
    html=urllib2.urlopen(item_url)
    #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"
    regrex_rate='"(//.*?user\-rate.*?)"'
    codes= re.findall(regrex_rate,html.read())
    html.close()

    user_rate_url= 'http:'+codes[0]
    print 'uu', user_rate_url

    user_rate_html = urllib2.urlopen(user_rate_url)
    rate_page = user_rate_html.read()   # read once; a second .read() would return ''
    print rate_page
    #title="4.78589分"
    desc_regex=u'title="(4.[0-9]{5}).*?'
    de_pat=re.compile(desc_regex)

    descs = re.findall(de_pat,rate_page)
    print len(descs)

#item_url='https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail'
#get_user_rate(item_url)
'''Fetches the seller's credit rating; not accessible without logging in... unused for now'''


def get_praised_good(url,file_open,keyword,counts,descripHrequ,servHrequ,descripNrequ):
    # from the given Taobao search URL, collect the products that meet the requirements
    html=urllib2.urlopen(post_request(url))
    code=html.read()
    html.close()

    regrex2=ur'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)".*?"comment_count":"(.*?)".*?"nick":"(.*?)".*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],"service":\[(.*?),(.*?),(.*?)\]' 
    #each match yields 15 strings
    #x[0] product name, x[1] image URL, x[2] product URL, x[3] price, x[4] review count, x[5] shop name, x[6]-x[8] delivery (3 values), x[9]-x[11] description (3 values), x[12]-x[14] service (3 values)
    pat=re.compile(regrex2)
    meet_code=re.findall(regrex2,code)#

    for x in meet_code:
        if counter1.count>=counts :
            print "have get enough pruducts"
            break
        description_higher=int(x[10])*float(x[11])/100
        service_higher=int(x[13])*float(x[14])/100
        try:
            x4=int(x[4])   # review (comment) count
        except:
            x4=0
        if  (description_higher>=descripHrequ) and (service_higher>=servHrequ) and x4>=descripNrequ:
            if re.findall(keyword,x[0]) :  # matching a Chinese keyword against the title is not solved yet; just put it in the search term instead
                x0=x[0].replace(' ','').replace('/','')
                detail_url='http:'+x[2].decode('unicode-escape').encode('utf-8')
                x1='http:'+x[1].decode('unicode-escape').encode('utf-8')
                #print type(x)
                if detail_url  in counter1.url_list:
                    counter1.new_flag=False
                    print 'no more new met products'
                    print counter1.url_list
                    print detail_url
                    break
                counter1.url_list.append(detail_url)
                counter1.try_time+=1
                counter1.count+=1

                x11=float(x[11])/100
                x9=float(x[9])/100
                x12=float(x[12])/100
                x6=float(x[6])/100
                x3=float(x[3])
                counter1.p+=x3
                counter1.d+=x9
                x5=unicode(x[5],'utf-8')
                                
                result_list=[]
                result_list.append(x0)
                result_list.append(x1)
                result_list.append(detail_url)
                result_list.append(x3)
                result_list.append(x4)
                result_list.append(x5)
                result_list.append(x6)
                result_list.append(x9)
                result_list.append(x12)
                #indices from 0: x0 product name, x1 image URL, x2 product URL, x3 price, x4 review count, x5 shop name, x6 delivery, x7 description, x8 service
                counter1.results.append(result_list)
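# A note on the extraction above: at the time of writing, the search page embeds
# its result data as JSON in a "g_page_config = {...};" script block, and the big
# regex simply picks fields out of that text. A json-based alternative could look
# roughly like the sketch below; the exact key path ('mods' -> 'itemlist' ->
# 'data' -> 'auctions') is an assumption and is left commented out, untested:
#
#   import json
#   m = re.search(r'g_page_config = ({.*});', code)
#   if m:
#       auctions = json.loads(m.group(1))['mods']['itemlist']['data']['auctions']
#       for item in auctions:
#           print item['raw_title'], item['view_price'], item['comment_count']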
            

def save_downpic(lis,file_open,savePath):
    '''download each product's image from lis into savePath, and write its info to file_open'''
    #indices from 0: x0 product name, x1 image URL, x2 product URL, x3 price, x4 review count, x5 shop name, x6 delivery, x7 description, x8 service, x9 rate
    len_list=len(lis)
    print len_list
    cc=0        
    for x in lis:
        if True:
            urllib.urlretrieve(x[1],savePath+'\\%s___'%cc +unicode(x[0],'utf-8')+'.jpg')

            txt_name=savePath+'\\'+ '%s__'%cc+ 'custome_description_%s __'%x[7] +'__comments_%s_'%x[4]+ '___price_%srmb___'%x[3] +x[5] +'.txt'
                    
            file_o=open(txt_name,'a')
            file_o.write(x[2])
            file_o.close()
            
            print '\nget_one_possible_fine_goods:\n','good_name:',x[0].decode('utf-8')
            print 'rate=',x[9]
            print 'price:',x[3],x[5]#.decode('utf-8')
            print 'custome_description:',x[7],'--','described_number:',x[4],'  service:',x[8]
            print x[2].decode('utf-8'),'\ngood_pic_url:',x[1].decode('utf-8')

            print txt_name
            print cc+1,"th"

            file_open.write(u'%s__'%cc +u'%s'%x[0]+'\nprice:'+str(x[3])+'¥,'+'\n'+str(x[2])+'  \n'+str(x[5])+'\ncustomer_description:'+str(x[7])+'described_number:'+str(x[4])+'\n\n\n')
            
            print 'get one -^-'
        # except :
        #     print "failed to down picture or creat txt"
        #     counter1.fail_time += 1
        
        cc+=1
        time.sleep(0.5)
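# The file names built above reuse the product title, which get_praised_good()
# only strips of ' ' and '/'; other characters Windows forbids in file names
# (\ : * ? " < > |) can still slip through and make urlretrieve/open fail.
# A small helper one could apply to x[0] and txt_name first (hypothetical
# addition, not called by the code above):
#
#   def safe_filename(name):
#       return re.sub(ur'[\\/:*?"<>|]', '_', name)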


def get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max,descripHrequ,servHrequ ,descripNrequ):
    # walk through the search results page by page
    #initial url and page number
    initial_url='https://s.taobao.com/search?q='+reserch_goods+'&filter=reserve_price%5B'+'%s'%price_min+'%2C'+'%s'%price_max+'%5D&s='
    print "initial_url",initial_url
    page_n=0
    reserve_file=savePath+r'\found_goods.txt'
    file_open=open(reserve_file,'a')

    file_open.write('****************************\n')
    file_open.write(time.ctime())
    file_open.write('\n****************************\n')

    while counter1.new_flag and counter1.count<counts :
        
        url_1=initial_url+'%s'%(44*page_n)
        #print initial_url
        print 'url_1:', url_1
        #print 'ss',initial_url+'%s'%(44*page_n)
        page_n += 1
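        # NOTE: 'keyword' below is the module-level global set in __main__;
        # the local 'keyword' assigned inside input_para_inner() is never
        # passed down to this function.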

        get_praised_good(url_1,file_open,keyword,counts,descripHrequ,servHrequ ,descripNrequ)
        time.sleep(2)
        # except:
        print "%s"%page_n,"pages have been searched"            
        if page_n >=11 :
            print "check keyword,maybe too restrict"
            break
    print url_1        
    product_rank(counter1.results)

    counter1.results.sort(key=lambda x :x[9],reverse=True)        

    save_downpic(counter1.results,file_open,savePath)
    
    #
    for a in  counter1.results:
        for b in a :
            file_open.write(unicode(str(b),'utf-8'))
            file_open.write('\t')
        file_open.write('\n\n')
    
    file_open.close()
    counter1.print_counter()


def input_para_inner():
    reserch_goods='ssd120g'     # Taobao search term
    keyword='.'  #raw_input().decode("gbk").encode("utf-8")   # extra keyword the product title must contain (a regular expression; '.' means no restriction)
    price_min=22            # price range
    price_max=400
    descripHrequ=0   # %, description score must be this far above average
    servHrequ=0      # %, service score must be this far above average
    descripNrequ=6   # minimum number of reviews
    counts=10        # how many products to select

    #savePath=r"C:\Users\Administrator\Desktop\Python scrapy\find_worthy_goods\results"   # path where results are saved
    savePath=r"results"   # path where results are saved
    while os.path.exists(savePath):
        savePath =savePath+'%s'%random.randrange(0,100)
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    
    get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max ,descripHrequ,servHrequ,descripNrequ)


def input_para_byinterface():
    print '说明:\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘(默认桌面新建find_worty_goods文件夹)并建立同序号开头的txt文件,图片显示商品,其旁的txt文件名显示价格等关键信息,txt里保存商品的淘宝链接'.decode('utf-8')
    print "please input reserch _goods_name"
    #print "请输入搜索商品名称;注意不能有空格,下同".decode('utf-8')
    reserch_goods=raw_input().replace(' ','')    # Taobao search term, with any accidentally typed spaces removed
    if reserch_goods:
        # #
        # print "please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords"
        # try:
        #     keyword=raw_input().decode("gbk").encode("utf-8")      #个人限定词,商品名字必须包含,防止淘宝推荐了其他相关词 (正则表达式). 为任意表示不作限制
        # except:
        #     keyword='青'
        # #    
        keyword='.'

        print "\nplease input  _minimal price and _maximal price;   \ndefault by 0,10000\nnext by 'enter'key input nothing means by default,the same below "
        print '请输入价格范围 ;默认0-10000 ;两项用半角逗号","分隔 按回车键确认;什么也不输入代表使用默认值 '.decode('utf-8')
        try:
            price_min, price_max=input()
        except:
            print 'not input or wrong number,use default range'
            price_min, price_max = 0 ,10000
        #    
        print "\nplease input  _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ;   \ndefault by 0,0  I.e better than average"
        print '请输入商品描述、服务高于平均值的百分比-100 ~100'.decode('utf-8')
             # %, defaults to above average; results must exceed this value
        try:
            descripHrequ,servHrequ=input()              
        except:
            print 'not input or wrong number,use default range'
            descripHrequ = 0  # %, defaults to above average
            servHrequ = 0
        #    
        print "\nplease input description count limit,  default more than 1"
        print '输入最低商品评价数,默认大于1'.decode('utf-8')
        try:
            descripNrequ=input()
        except :
            print 'not input or wrong number,use default range'
            descripNrequ=1
        #
            
        print "\nIF customise file reserve path, Y or N  \ndefault/sample as:  C:\\Users\\Administrator\\Desktop\\find_worthy_goods\\results "
        print '是否自定义保存文件目录 Y or N'.decode('utf-8')
        if raw_input()=='Y':
            print "please input path that you want to reserve;  \n "    
            savePath = raw_input()
        else:
            savePath=r"C:\Users\Administrator\Desktop\find_worthy_goods\results"#结果保存路径
        
        while os.path.exists(savePath):
            savePath = savePath+'%s'%random.randrange(1,10)
            #print "the path exist,we'll make a new one"
        try:
            os.makedirs(savePath)
            print 'ok,file_path we reserve results:  %s'%savePath
            print '保存的路径为:'.decode('utf-8')
        except:
            print "failed to make file path\nplease restart program"
            print '创建文件夹失败,请重新启动程序'.decode('utf-8')
            
        #
        print "\nplease input how many results you want,  default by 50"
        print '您要获取的商品数目,默认50'.decode('utf-8')
        try:
            counts=input()
        except :
            counts=50
        #

        get_all_praised_goods(reserch_goods,counts,savePath,price_min,price_max,descripHrequ,servHrequ,descripNrequ)
        print '\n'
        counter1.print_counter()
        print "finished,please look up in %s"%savePath
        print '下载完成'.decode('utf-8')

        print counter1.results
        input()
    else:
        print "no search goods"
        print '没有输入商品名称'.decode('utf-8')


if __name__=="__main__":
    print '是否自定义'
    x= 1
    keyword =''

    if x==1:
        input_para_inner()
    else:
        input_para_byinterface()
    
    
    # Images are saved with the product title as the file name, prefixed with the rank number.
    # Price, shop name, description and service scores are written to txt files at the same time,
    # so once a picture catches your eye the item can be looked up by its number.
    # With description and service rated above average, the shopping experience should be acceptable.

 

Features that could be added:

  an interactive input interface

  MySQL storage of the data, so that price changes can be compared over time