# -*- coding: utf-8 -*-
# Demo: syncs from 17k.com. Of course, I only collect VIP novels. Leave it
# running in the background and it auto-syncs whenever a VIP novel on 17k updates.
import urllib2
import time
import threading
import re
import sys
 
thlen = 10     # number of collection threads to run concurrently
books = []     # queue of books waiting to be collected
tsk = []       # array of collection threads
bookdict = {}  # books already collected: key = book ID on the source site, value = word count
 
 
 domain = 'yoursite.domain.com'
 adminuser = 'admin'
 adminpass = '******'
siteid = '23'  # ID of the collection rule; find it in the Jieqi admin panel or config file
 # notaddnew = '0'
 
 frompage = 'http://all.17k.com/lib/book/2_0_0_0_0_0_2_0_1.html'
 
 
 
def addbooklist():
    while 1:
        time.sleep(30)
        print '[' + time.strftime('%H:%M:%S') + '] update-list collection thread started.'
        start = time.time()
        try:
            response = urllib2.urlopen(frompage, timeout=12)
            content = response.read()
        except:
            continue
        response.close()
        elapsed = (time.time() - start)
        # capture groups: (book ID, book title, word count)
        bookattr = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>', content, re.M)
        print '[' + time.strftime('%H:%M:%S') + '] update list collected, elapsed: ' + str(elapsed) + 's'
        for ii in range(len(bookattr)):
            newbookid = bookattr[ii][0]
            newbookname = bookattr[ii][1]
            newbooksize = bookattr[ii][2]
            inlist = False
            for tt in range(len(books)):
                if books[tt][0] == newbookid:
                    inlist = True
            if not inlist:
                # book ID is not in the pending-collection array
                if newbookid in bookdict.keys():
                    # book ID was collected before; use the word count to decide whether it has updated
                    if int(newbooksize) > int(bookdict[newbookid]):
                        # word count grew since last collection: queue it again
                        books.append([newbookid, newbookname, newbooksize])
                        print 'Book ' + newbookid + ' updated, old size: ' + bookdict[newbookid] + ' new size: ' + newbooksize + '  queued for collection.'
                    else:
                        print 'Book ' + newbookid + ' unchanged, old size: ' + bookdict[newbookid] + ' new size: ' + newbooksize + '  ignored.'
                else:
                    # book ID was never collected: queue it
                    books.append([newbookid, newbookname, newbooksize])
                    print 'Book ' + newbookid + ' not collected recently, queued for collection.'
        print '[' + time.strftime('%H:%M:%S') + '] update-list collection thread done, sleeping.'
 
 
def caiji(bookid, bookname, booksize):
    print 'Collecting book [' + bookid + '] title: ' + bookname
    url = 'http://' + domain + '/modules/article/admin/batchcollect.php?action=bcollect&siteid=' + siteid + '&batchids=' + bookid + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
    start = time.time()
    page = urllib2.urlopen(url, timeout=3600)
    # drain the response in 8 KB chunks; the collection runs server-side while we read
    data = page.read(8192)
    while data:
        data = page.read(8192)
    page.close()
    elapsed = (time.time() - start)
    time.sleep(5)  # wait 5 seconds after collecting so the full book gets generated
    print 'Book [' + bookid + '] title: ' + bookname + ' size: ' + booksize + 'k collected! Elapsed: ' + str(elapsed) + 's'
    print 'Book [' + bookid + '] title: ' + bookname + ' size: ' + booksize + 'k added to the recently-collected dictionary.'
 
 
# First pass: fetch the IDs, titles and sizes of the books to collect from the list page
start = time.time()
response = urllib2.urlopen(frompage, timeout=12)
content = response.read()
response.close()
elapsed = (time.time() - start)
booklist = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>', content, re.M)
print 'First fetch found ' + str(len(booklist)) + ' books to collect, elapsed: ' + str(elapsed) + 's'
books = books + booklist
if len(books) < 3:
    print 'Failed to fetch the list page, exiting!'
    sys.exit()
 
# start the book-list collection thread
thaddbooklist = threading.Thread(target=addbooklist, name='taddbooklist')
thaddbooklist.start()

# spin up the initial pool of collection threads
for x in range(thlen):
    bookid = books[0][0]
    bookname = books[0][1]
    booksize = books[0][2]
    tname = 't' + str(x)
    th = threading.Thread(target=caiji, name=tname, args=(bookid, bookname, booksize))
    th.start()
    del books[0]
    bookdict[bookid] = booksize
    tsk.append(th)
 
# watch for idle threads: when a thread finishes and the pending list is
# non-empty, reuse its slot to collect the next book
while 1:
    time.sleep(5)
    for i in range(len(tsk)):
        if not tsk[i].is_alive():
            print tsk[i].name + ' thread idle'
            if len(books) > 0:
                bookid = books[0][0]
                bookname = books[0][1]
                booksize = books[0][2]
                th = threading.Thread(target=caiji, name=tsk[i].name, args=(bookid, bookname, booksize))
                th.start()
                del books[0]
                bookdict[bookid] = booksize
                tsk[i] = th
 
 
 
 
Sample runtime output:

[11:10:44] update-list collection thread started.
[11:10:45] update list collected, elapsed: 0.368046998978s
Book 1257715 unchanged, old size: 508549 new size: 508549  ignored.
Book 437108 unchanged, old size: 3070245 new size: 3070245  ignored.
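
A design note: the script above shares the plain books list between the list-collection thread and the main loop with no locking, and polls is_alive() every 5 seconds to reuse thread slots. Python 2's standard Queue module gives the same producer/worker shape with the locking and blocking built in. A minimal sketch; names like bookq and worker are mine, not from the original script:

# Thread-safe producer/worker sketch using Python 2's Queue module.
# Illustrative only: the worker just prints instead of calling batchcollect.php.
import threading
import Queue

bookq = Queue.Queue()  # thread-safe replacement for the plain books list

def worker():
    while True:
        bookid, bookname, booksize = bookq.get()  # blocks until a book is queued
        try:
            print 'collecting book [%s] %s (%sk)' % (bookid, bookname, booksize)
            # ...call the batchcollect.php URL here, as caiji() does above...
        finally:
            bookq.task_done()

for _ in range(10):  # same pool size as thlen = 10 above
    t = threading.Thread(target=worker)
    t.daemon = True  # daemon workers let the process exit while they block on get()
    t.start()

bookq.put(('1257715', 'example title', '508549'))  # the producer side just put()s
bookq.join()  # wait until every queued book has been processed

With a queue there is no need to scan for idle threads: each worker blocks on get() and picks up the next book the moment it is queued.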
 
#Auto-login to the backend, auto-purchase script.
#vmid: log in and create a VPS yourself, then view the page source; look for "vmid".
email: the email you sign in with
password: no need to explain this one...
Copy the code below and save it as op.py
Run it on Linux with: python /path/op.py
These days Linux generally ships with Python, even on a minimal install.
If yours doesn't:
Debian/Ubuntu: apt-get install python
CentOS/RHEL: yum install python
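
Since op.py relies on Python 2-only modules (urllib2, cookielib), a quick way to confirm the installed python can run it is to try the imports first. This small diagnostic is my addition, not part of the original post:

# Check that the Python 2-only modules op.py depends on are importable.
# print() call syntax is used so this snippet also parses under Python 3.
try:
    import urllib2
    import cookielib
    print('Python 2 with urllib2/cookielib: OK')
except ImportError as err:
    print('missing module (%s): this python cannot run op.py' % err)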
 ------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
################   original work by vpskk; please credit http://www.vpskk.com when reposting    #####################
 
 import urllib2
 import urllib
 import cookielib
 import re
 import time
 
vmid = "your vm id"
email = "your email"
password = "your password"

auth_url = 'https://panel.op-net.com/login'
check_url = 'https://panel.op-net.com/cloud/open'
create_url = 'https://panel.op-net.com/cloud/open'

# location markers scraped from the order page; the script treats their absence as "available"
hk_check_str = '<span>Hong Kong'
jp_check_str = '<span>Tokyo'
 
 
# login credentials
data = {
    "email": email,
    "password": password,
    "submit": "Sign in"
}

# form data for the VM-creation check page
checkdata = {
    "vm_id": vmid,
    "x": "19",
    "y": "24",
}
i = 1
 
# a CookieJar to hold the session cookies
cookieJar = cookielib.CookieJar()
# one global opener that sends those cookies on every request
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
 
while 1:
    try:
        time.sleep(10)
        print '[' + time.strftime('%H:%M:%S') + '] check #' + str(i)
        i = i + 1
        # url-encode the form payloads
        post_data = urllib.urlencode(data)
        check_data = urllib.urlencode(checkdata)
        # request headers
        headers = {
            "Host": "panel.op-net.com",
            "Referer": auth_url,
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36",
        }

        # hit the check page; if we can scrape a csrf token we are still logged in
        reqcheck = urllib2.Request(check_url, check_data, headers)
        resultcheck = opener.open(reqcheck)
        html = resultcheck.read()
        csrf = re.findall(r'csrf_token" value="([0-9a-z]+)"/>', html, re.M)
        if len(csrf) > 0:
            # token found, no need to log in again
            print "have csrf: " + csrf[0]
        else:
            print "No csrf, auto login......"
            # log in to get a session cookie
            req = urllib2.Request(auth_url, post_data, headers)
            result = opener.open(req)
            print "Login OK, next get csrf."
            # hit the check page again for a fresh csrf token
            reqcheck = urllib2.Request(check_url, check_data, headers)
            resultcheck = opener.open(reqcheck)
            html = resultcheck.read()
            csrf = re.findall(r'csrf_token" value="([0-9a-z]+)"/>', html, re.M)
            if len(csrf) > 0:
                print "new csrf: " + csrf[0]
            else:
                print "still no csrf, retrying."
                continue

        if not hk_check_str in html:
            print "HK is available, creating it!"
            location = "13"
        elif not jp_check_str in html:
            print "JP is available, creating it!"
            location = "14"
        else:
            print "HK and JP are both unavailable, will check again."
            continue

        # VM-creation form data
        create = {
            'csrf_token': csrf[0],
            'plan': 'Plan 01',
            'vm_id': vmid,
            'location': location,
            'os': 'linux-debian-6.0.6-x86_64-min-gen2-v1',
            'hostname': 'op.vpskk.com',
            'root': '',
        }
        # url-encode and submit the creation request
        create_data = urllib.urlencode(create)
        reqcreate = urllib2.Request(create_url, create_data, headers)
        result = opener.open(reqcreate)
        if "The requested location is currently unavailable" in result.read():
            print "unavailable... waiting to check again......"
        else:
            print "Create VM OK"
            break
    except:
        continue
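
Everything in the loop above hinges on the csrf_token regex actually matching the panel's markup. It can be sanity-checked offline before pointing the script at the live site; the HTML fragment below is invented for illustration, so the real page may differ:

# Offline sanity check of the csrf_token regex (the sample HTML is made up).
import re

sample_html = '<input type="hidden" name="csrf_token" value="3f2a9c1e7b"/>'
csrf = re.findall(r'csrf_token" value="([0-9a-z]+)"/>', sample_html, re.M)
print csrf  # expected output: ['3f2a9c1e7b']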