今天整理的程式碼,主要進度為新增網站回應時間,取得sitemap
def get_sitemap(self):
    """Crawl the target site breadth-first and store the result in ``self.siteMap``.

    Starts from ``self.targetUrl`` and, for every collected entry, downloads
    the page with ``self.get_url_data(url, True)``, extracts its tags with
    ``self.get_tag(data, True)`` and its links with ``self.get_link(tags)``.

    A link is added to the sitemap when it either starts with
    ``self.targetUrl`` (same site) or does not start with ``http``
    (treated as a relative path). Fragment-only links (``#...``) and empty
    links are ignored, and every entry is recorded only once.

    Entries are stored exactly as they appear in the page; relative entries
    are resolved against ``self.targetUrl`` only at download time.

    Returns:
        None. The ordered list of discovered entries is written to
        ``self.siteMap``.
    """
    # Local imports keep this method self-contained.
    from http.client import HTTPException
    from urllib.parse import urljoin

    root = self.targetUrl
    site_map = [root]
    seen = {root}  # O(1) duplicate check alongside the ordered list

    ptr = 0
    while ptr < len(site_map):
        entry = site_map[ptr]
        ptr += 1

        # urljoin leaves absolute URLs untouched and resolves relative ones
        # (including "../" segments) instead of blindly concatenating.
        page_url = urljoin(root, entry)
        try:
            data = self.get_url_data(page_url, True)
            tag_list = self.get_tag(data, True)
        except (OSError, ValueError, HTTPException):
            # Best-effort crawl: an unreachable, malformed or undecodable
            # page is skipped rather than aborting the whole crawl.
            continue

        for link in self.get_link(tag_list):
            if not link or link.startswith("#"):  # skip empty / hash links
                continue
            same_site = link.startswith(root)
            relative = not link.startswith("http")
            if (same_site or relative) and link not in seen:
                seen.add(link)
                site_map.append(link)

    self.siteMap = site_map
# Helper used only by the sitemap crawler.
def get_link(self, TagList):
    """Collect the ``href`` values of all ``<a>`` tags in ``TagList``.

    Args:
        TagList: iterable of ``(tag_name, attribute_string)`` pairs, e.g.
            ``("a", 'href="/x" class="c"')``.

    Returns:
        A list of href values in document order, with surrounding quotes
        removed. Tags other than ``<a>`` and ``<a>`` tags without a usable
        ``href`` contribute nothing.
    """
    target_tags = ("a",)
    # Accepts double-quoted, single-quoted and unquoted values, tolerates
    # whitespace around "=", and requires "href" to start an attribute so
    # that names such as "data-href" are not picked up.
    href_re = re.compile(
        r"""(?:^|\s)href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
        re.IGNORECASE,
    )

    alink_list = []
    for entry in TagList:
        if entry[0].lower() not in target_tags:  # only <a> tags
            continue
        for match in href_re.finditer(entry[1]):
            # Exactly one of the three alternatives matched.
            value = next(g for g in match.groups() if g is not None)
            alink_list.append(value)
    return alink_list
def get_tag(self, data=None, flag=False):
    """Extract ``(tag_name, attribute_string)`` pairs from HTML.

    Only opening tags that have at least one attribute are matched, i.e.
    tags of the form ``<name attrs...>`` with a single space after the name.

    Args:
        data: raw page content (bytes or str). When omitted, ``self.data``
            is used.
        flag: when true, return the tag list; otherwise store it in
            ``self.TagList``.

    Returns:
        The list of ``(tag_name, attribute_string)`` tuples when ``flag`` is
        true, otherwise ``None``.
    """
    # A default value cannot refer to ``self``; resolve it at call time.
    if data is None:
        data = self.data

    if isinstance(data, (bytes, bytearray)):
        # Tag syntax is ASCII, so undecodable bytes are replaced rather than
        # letting one bad byte abort the extraction.
        text = data.decode("utf-8", errors="replace")
    else:
        text = str(data)

    tag_list = re.findall(r"<([\w]+) (.*?)>", text)

    if flag:
        return tag_list
    self.TagList = tag_list
    return None
def get_url_data(self, url=None, flag=False):
    """Download a URL and return or store its raw body.

    Args:
        url: address to fetch. When omitted, ``self.targetUrl`` is fetched.
            For backward compatibility a boolean passed here is treated as
            ``flag`` (the old signature was ``get_url_data(flag)``).
        flag: when true, return the downloaded bytes; otherwise store them
            in ``self.data``.

    Returns:
        The response body as bytes when ``flag`` is true, otherwise ``None``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        malformed URL; errors are not swallowed here.
    """
    # Legacy call shape: get_url_data(True) meant flag=True.
    if isinstance(url, bool):
        flag = flag or url
        url = None
    if url is None:
        url = self.targetUrl

    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    if flag:
        return data
    self.data = data
    return None
def seo_analyze(self, targetUrl, level="4"):
    """Run the analysis pipeline for ``targetUrl``.

    Stores the URL in ``self.targetUrl``, downloads the page while timing
    the request, then extracts its tags and builds the sitemap.

    Args:
        targetUrl: address of the site to analyse.
        level: currently unused; kept for interface compatibility.

    Side effects:
        Sets ``self.targetUrl`` and ``self.TimeUse`` (download time in
        seconds, rounded to 4 decimals) and calls ``self.get_url_data()``,
        ``self.get_tag()`` and ``self.get_sitemap()`` in that order.
    """
    self.targetUrl = targetUrl

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    started = time.perf_counter()
    self.get_url_data()
    elapsed = time.perf_counter() - started
    self.TimeUse = round(elapsed, 4)

    self.get_tag()
    self.get_sitemap()
以上,晚安!