After grinding away for a while, at least it no longer throws errors! But the data still looks wrong... time to check whether the way it's being stored is actually correct!
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
import urllib.request
import re
import time
class MyWebsiteSEO():
    def __init__(self, targetUrl=""):
        self.targetUrl = targetUrl
        self.urlobj = ""
        self.data = ""
        self.TagList = []
        self.TimeUse = ""
        self.siteMap = []
    def get_css_len(self, targetUrl, TagList):
        targetTag = ["link"]
        targetAttr = ["href"]
        resultList = self.get_attr(TagList, targetTag, targetAttr)
        cssLen = 0
        for x in resultList:
            if re.search(r'^http', x):  # absolute URL
                data = self.get_url_data(x, True)
                cssLen += len(data.decode("utf-8"))
            elif re.search(r'^\/[\w]', x):  # root-relative path
                data = self.get_url_data(targetUrl + x[1:], True)
                cssLen += len(data.decode("utf-8"))
            elif re.search(r'^\/\.\.', x):  # parent-relative path, not handled yet
                pass
        return cssLen
    def get_js_len(self, targetUrl, TagList):
        targetTag = ["script"]
        targetAttr = ["src"]
        resultList = self.get_attr(TagList, targetTag, targetAttr)
        jsLen = 0
        for x in resultList:
            if re.search(r'^http', x):  # absolute URL
                data = self.get_url_data(x, True)
                jsLen += len(data.decode("utf-8"))
            elif re.search(r'^\/[\w]', x):  # root-relative path
                data = self.get_url_data(targetUrl + x[1:], True)
                jsLen += len(data.decode("utf-8"))
            elif re.search(r'^\/\.\.', x):  # parent-relative path, not handled yet
                pass
        return jsLen
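    # Example (illustrative): if the fetched page references
    # <link href="/static/a.css"> and <script src="/static/b.js">, the two
    # helpers above download those files and return the total character
    # count of the stylesheets and scripts respectively.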
    def get_attr(self, TagList, targetTag=["a"], targetAttr=["href"]):
        resultList = []
        for x in targetAttr:
            for y in TagList:
                if y[0] in targetTag:
                    Attr = y[1].split(" ")
                    for z in Attr:
                        IgCh = ['"', "'"]  # quote characters to strip
                        if re.search(r'^' + x + '=', z):
                            if len(z) > len(x) + 1 and z[len(x) + 1] in IgCh:  # value is quoted
                                resultList.append(z[(len(x) + 2):-1])
                            else:
                                resultList.append(z[(len(x) + 1):])
        return resultList
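    # Example (illustrative): for a TagList entry
    # ("link", 'href="/static/a.css" rel="stylesheet"'),
    # get_attr(TagList, ["link"], ["href"]) returns ["/static/a.css"].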
    def get_sitemap(self):
        self.siteMap.append(self.targetUrl)
        ptr = 0
        while True:
            try:
                nextUrl = self.siteMap[ptr]
            except IndexError:  # queue exhausted, crawl finished
                break
            if not re.search(r'^http', nextUrl):  # relative path
                # TODO: paths like ../../../ should be resolved properly here
                nextUrl = self.targetUrl + nextUrl.lstrip("/")
            try:
                data = self.get_url_data(nextUrl, True)
                TagList = self.get_tag(data, True)
                alinkList = self.get_link(TagList, True)
            except Exception:  # skip pages that fail to fetch or parse
                ptr += 1
                continue
            #alink_to_sitMap( alinkList, self.targetUrl )
            for x in alinkList:
                if re.search(r'^#', x):  # skip in-page anchors
                    continue
                rehostUrl = "^" + re.escape(self.targetUrl)
                if re.search(rehostUrl, x):  # same-domain check
                    if x not in self.siteMap:  # avoid duplicates
                        self.siteMap.append(x)
                if not re.search(r'^http', x):  # relative path
                    if x not in self.siteMap:  # avoid duplicates
                        self.siteMap.append(x)
            ptr += 1
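    # Example (illustrative): seeded with "https://example.com/", the loop
    # fetches each queued page in turn; a link "/about/" found on it is queued
    # and fetched as "https://example.com/about/" on a later pass, and ptr
    # advances until it runs past the end of self.siteMap.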
    # helper used only by the sitemap routine
    def get_link(self, TagList, flag=False):
        targetList = ["a"]
        alinkList = []
        for y in TagList:
            if y[0] in targetList:  # look for <a> tags
                Attr = y[1].split(" ")
                for z in Attr:
                    IgCh = ['"', "'"]  # quote characters to strip
                    if re.search(r'^href=', z):
                        if len(z) > 5 and z[5] in IgCh:  # value is quoted
                            alinkList.append(z[6:-1])
                        else:
                            alinkList.append(z[5:])
        return alinkList
    def get_tag(self, data, flag=False):
        reObj = re.compile(r"<(\w+) (.*?)>")
        targetTag = reObj.findall(data.decode("utf-8"))
        TagList = []
        for x in targetTag:
            TagList.append(x)
        if flag:
            return TagList
        else:
            self.TagList = TagList
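    # Example (illustrative): for the markup '<a href="/about/" class="nav">About</a>'
    # the pattern above yields the tuple ("a", 'href="/about/" class="nav"'),
    # i.e. the tag name plus its raw attribute string.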
    def get_url_data(self, targetUrl="", flag=False):
        # fetch raw bytes; defaults to self.targetUrl when no URL is given
        urlobj = urllib.request.urlopen(targetUrl or self.targetUrl)
        data = urlobj.read()
        urlobj.close()
        if flag:
            return data
        else:
            self.data = data
    def seo_analyze(self, targetUrl, level="4"):
        self.targetUrl = targetUrl
        TimeStart = time.time()
        self.get_url_data()
        TimeEnd = time.time()
        TimeUse = TimeEnd - TimeStart
        self.TimeUse = round(TimeUse, 4)
        self.get_tag(self.data)
        self.get_sitemap()
        return self.siteMap
def test():
    #targetUrl = "http://192.168.1.106/"
    targetUrl = "https://www.python.org/"
    websiteobj = MyWebsiteSEO()
    ResultData = websiteobj.seo_analyze(targetUrl)  # seo_analyze( url [,level] )
    print(ResultData)
    # ResultData should eventually carry: 1. header data 2. response time
    # 3. sitemap 4. img alt 5. css/js response time
if __name__ == "__main__":
    test()
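A minimal usage sketch, assuming the class above is defined in the same file; the sample URL mirrors test(), and the attributes read back are the ones set in __init__:

# Illustrative only: run the analyzer against one site and inspect the
# attributes populated along the way. A full crawl of a large site can
# take a long time.
seo = MyWebsiteSEO()
sitemap = seo.seo_analyze("https://www.python.org/")  # returns self.siteMap
print("response time:", seo.TimeUse, "seconds")       # filled in by seo_analyze()
print("pages found:", len(sitemap))
# the tag list captured by get_tag() can also feed the asset-size helpers:
print("css length:", seo.get_css_len(seo.targetUrl, seo.TagList))
print("js length:", seo.get_js_len(seo.targetUrl, seo.TagList))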
To be continued tomorrow! (Sir, are you going to make the deadline at this pace? Inner monologue: as long as there's no overtime...)
That's it for today. Good night!