Python 自动发布文章到 WordPress:解决思路
1. 利用 POST 请求向 WordPress 提交表单
2. 通过 wordpress_xmlrpc 模块:有现成的轮子,不用还想干啥
3. 通过 MySQLdb 直接插入数据库:有服务器、不需要远程访问时,直接把 py 脚本放在服务器上跑
我们这次要用轮子拼一台摩托车!至于宝马嘛,自己动手吧。
开始动手:需要自行安装的模块有 requests 和 python-wordpress-xmlrpc;Windows、Linux 下的安装命令如下,土豪随意:
pip install requests
pip install python-wordpress-xmlrpc
caiji.py:
# encoding=utf-8
"""Scrape articles from a WordPress blog and optionally republish them.

The script is written against WordPress-style list and article pages; it is
meant as a practice scraper for such blogs.
"""
import random
import re
import threading
import time
import urllib

import requests
import threadpool
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost

# Log in to the target WordPress site over XML-RPC.
# NOTE(review): the endpoint URL looks truncated and the credentials are
# placeholders -- replace all three before running.
try:
    wp = Client('ample/xmlrpc.php', 'wp的账号', 'wp的密码')
except Exception:
    # The first attempt failed; try exactly once more and let a second
    # failure propagate.
    wp = Client('ample/xmlrpc.php', 'wp的账号', 'wp的密码')
post = WordPressPost()

# Bookkeeping that prevents the same article URL from being scraped twice.
# NOTE(review): both file paths below are empty strings -- the real file
# name(s) must be filled in before this can run.
f = open('', 'a+')
# "a+" may leave the read position at end-of-file; rewind before reading.
f.seek(0)
# NOTE(review): reconstructed from a garbled line (`ad()`); getData() reads a
# global named `urls`, presumably the previous contents of this file.
urls = f.read()
with open('') as seen_file:
    url_list = [m.strip() for m in seen_file]
# Proxy addresses; populated by ip().
daili_list = []
# Patterns used by filter_tags(), compiled once at import time.
_RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
_RE_SCRIPT = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
_RE_STYLE = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
# Only the literal "<br />" spelling becomes a newline; other <br> spellings
# are removed by the generic tag pattern.
_RE_BR = re.compile(r'<br />')
_RE_TAG = re.compile(r'</?\w+[^>]*>')
_RE_COMMENT = re.compile(r'<!--[^>]*-->')
_RE_BLANK_LINES = re.compile(r'\n+')


def filter_tags(htmlstr):
    """Strip HTML markup from *htmlstr* and return the remaining text.

    CDATA sections, <script> and <style> elements, HTML tags and HTML
    comments are removed; ``<br />`` is turned into a newline and runs of
    consecutive newlines are collapsed into one.

    Args:
        htmlstr: HTML source as a string.

    Returns:
        The text with markup removed.
    """
    s = _RE_CDATA.sub('', htmlstr)
    s = _RE_SCRIPT.sub('', s)
    s = _RE_STYLE.sub('', s)
    # Convert line breaks before the generic tag pass would delete them.
    s = _RE_BR.sub('\n', s)
    s = _RE_TAG.sub('', s)
    s = _RE_COMMENT.sub('', s)
    return _RE_BLANK_LINES.sub('\n', s)
# User-Agent strings that getUA() rotates through.
_USER_AGENTS = (
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
)


def getUA():
    """Return one User-Agent string chosen at random from the built-in list."""
    return random.choice(_USER_AGENTS)
def search(re_url, html):
    """Return the first match of a pattern in *html*.

    Args:
        re_url: Regular expression (string or compiled pattern).
        html: Text to search.

    Returns:
        The first element of ``re.findall(re_url, html)``, or the string
        ``'no'`` when there is no match.
    """
    matches = re.findall(re_url, html)
    return matches[0] if matches else 'no'
def ip():
    """Return a proxy address chosen at random from the proxy list file.

    The file is re-read on every call; addresses not seen before are added to
    the module-level ``daili_list`` (one address per line, blank lines are
    skipped), so the list no longer accumulates duplicates across calls.

    Returns:
        One entry of ``daili_list``.

    Raises:
        IndexError: If no proxy address is available.
    """
    known = set(daili_list)
    # NOTE(review): the proxy-list path is an empty string here -- the real
    # file name must be filled in.
    with open('') as proxy_file:
        for line in proxy_file:
            addr = line.strip()
            if addr and addr not in known:
                known.add(addr)
                daili_list.append(addr)
    return random.choice(daili_list)
def gethtml(url, headers):
    """Fetch *url* through a random proxy, retrying until a page is accepted.

    Each attempt picks a new proxy via ip(). A response is rejected, and the
    request retried, when the status code is not 200, when the body contains
    ``404`` or ``302 Found``, or when the body contains ``verify``. Any
    exception raised during an attempt also triggers a retry. There is no
    retry limit.

    Args:
        url: Address to request.
        headers: Dict of HTTP headers to send.

    Returns:
        The raw response body (``response.content``) of the first accepted
        response.
    """
    while True:
        try:
            newip = ip()
            proxies = {"http": "%s" % newip.strip()}
            # headers/proxies must be passed by keyword: positionally they
            # would be taken as requests.post()'s `data` and `json` arguments.
            # NOTE(review): POST is kept from the original; confirm that GET
            # was not intended for plain page fetches.
            pages = requests.post(url, headers=headers, proxies=proxies,
                                  timeout=10)
            html = pages.content
            code = pages.status_code
            if code != 200 or b'404' in html or b'302 Found' in html:
                print(u'代理失效重试')
                continue
            elif b'verify' in html:
                print(u'出验证码,重试')
                continue
            else:
                return html
        except Exception:
            # Best effort: a dead proxy, timeout or any other failure simply
            # leads to another attempt.
            continue
# Extracts article URLs from a list page; adjust to the target site.
# NOTE(review): the host part of this pattern looks truncated -- confirm.
re_url = re.compile(r'<a href="(www\.example\/.*?\d+\.html)"')
# Extracts (title, body HTML) from an article page. The readability module
# could be used instead; adjust the pattern to the target site.
re_title_content = re.compile(
    r'<h1 class="entry-title">(.*?)</h1>[\s\S]*?'
    r'<div class="entry-content">([\s\S]*?)<div class="clear">'
)
# (Original caption: articles are published to WordPress automatically
# through the wordpress-xmlrpc module.)
def getData(url):
    """Scrape one list page and process every article not handled before.

    Article links are extracted from the list page with ``re_url``. A link is
    skipped when it is already in ``url_list`` or occurs in ``urls``;
    otherwise the article page is fetched, its title and body are extracted
    with ``re_title_content``, the body is cleaned with filter_tags(), the
    result is printed, and the link is recorded in ``url_list`` and written
    to the file ``f``.

    Args:
        url: Address of a list page.
    """
    headers = {'User-Agent': getUA()}
    # `with` guarantees the lock is released when this call finishes, so the
    # next getData() call is not blocked forever.
    with mutex:
        html = gethtml(url, headers)
        for link in re.findall(re_url, html):
            link = link.strip()
            if link in url_list or link in urls:
                print('Noposts updates')
                continue
            page = gethtml(link, headers)
            for match in re.findall(re_title_content, page):
                try:
                    title = match[0]
                    content = filter_tags(match[1])
                except Exception:
                    # Best effort: skip a match that cannot be processed.
                    continue
                if not (title and content):
                    continue
                print('%s %s' % (title, content))
                # Publishing to WordPress is disabled here; uncomment the
                # lines below to post through XML-RPC.
                # post.title = title
                # post.content = content
                # post.post_status = 'publish'
                # wp.call(NewPost(post))
                url_list.append(link)
                f.write(link + '\n')
                print('Updates')
def now_time(url):
    """Thread-pool work item: scrape the single list page *url*."""
    getData(url)


# Lock used by getData().
mutex = threading.Lock()

if __name__ == '__main__':
    # List pages 1..11 of the target site. They are kept in their own list so
    # the already-scraped URLs in `url_list` are not overwritten.
    # NOTE(review): the URL prefix looks truncated -- confirm the real address.
    page_urls = ['ample/page/%d' % page_no for page_no in range(1, 12)]
    pool = threadpool.ThreadPool(3)
    for req in threadpool.makeRequests(now_time, page_urls):
        pool.putRequest(req)
    pool.wait()
采集内容发布到哪个默认目录(分类),可以在 WP 后台设置,也可以在代码里修改;具体可以参考 python-wordpress-xmlrpc 官方文档:https://python-wordpress-xmlrpc.readthedocs.io/en/latest/overview.html
另外,可以通过命令 crontab -e 设置定时任务,让脚本按需自动运行!

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。