import re
import time
import traceback
import requests_html
# 将js代码转为AST结构树
from slimit.parser import Parser
# 自定义访问者
from slimit.visitors.nodevisitor import ASTVisitor
# 判断类型
from slimit import ast
import os
# Shared HTTP session; main() issues its page request through this object.
requests = requests_html.HTMLSession()
# Module-level accumulator: one dict per media entry. VarStatement_Visitor
# appends the entries and Media_Visitor later fills in their 'videoUrl'.
flashvars = []
class VarStatement_Visitor(ASTVisitor):
    """Custom visitor that overrides the handling of ``VarStatement`` nodes.

    For every ``var`` statement it inspects the first declarator only:

    * a name containing ``flashvars`` whose initializer is an object literal
      has every element of its ``mediaDefinitions`` array converted to a
      dict;
    * a name containing ``qualityItems`` whose initializer is an array
      literal has every element converted to a dict.

    Each resulting dict is appended to the module-level ``flashvars`` list.
    Declarators whose initializer has a different node type are ignored
    instead of raising ``AttributeError``.
    """

    # Keys of a mediaDefinitions entry whose value is always stored exactly
    # as written in the script (never unquoted).
    _RAW_KEYS = ('defaultQuality', 'remote')

    @staticmethod
    def _key_name(key):
        """Return the property name of *key* without surrounding quotes."""
        # Only string-literal keys carry quotes; slicing an unquoted
        # identifier key would chop real characters off the name.
        return key.value[1:-1] if isinstance(key, ast.String) else key.value

    def _media_definition(self, item):
        """Convert one ``mediaDefinitions`` object literal into a dict."""
        media_data = {}
        for prop in item.properties:
            key, value = prop.children()
            field = self._key_name(key)
            if isinstance(value, ast.Array):
                # Array elements are kept as their raw token text.
                media_data[field] = [element.value for element in value.items]
            elif field in self._RAW_KEYS or not isinstance(value, ast.String):
                # Non-string literals (numbers, booleans, null) have no
                # quotes to strip, so they are stored untouched.
                media_data[field] = value.value
            else:
                media_data[field] = value.value[1:-1]
        return media_data

    def _quality_item(self, item):
        """Convert one ``qualityItems`` object literal into a dict."""
        media_data = {}
        for prop in item.properties:
            key, value = prop.children()
            field = self._key_name(key)
            if isinstance(value, ast.String):
                # Drop the quotes and undo the escaped forward slashes.
                media_data[field] = value.value[1:-1].replace('\\/', '/')
            else:
                media_data[field] = value.value
        return media_data

    def visit_VarStatement(self, node):
        """Harvest media dicts from a ``flashvars``/``qualityItems`` declaration.

        Args:
            node: the ``VarStatement`` node being visited.
        """
        name_node, initializer = node.children()[0].children()
        name = name_node.value

        # The flashvars object: only its mediaDefinitions array is read.
        if 'flashvars' in name and isinstance(initializer, ast.Object):
            for prop in initializer.properties:
                key, value = prop.children()
                if self._key_name(key) != 'mediaDefinitions':
                    continue
                if not isinstance(value, ast.Array):
                    continue
                for item in value.items:
                    flashvars.append(self._media_definition(item))

        # The qualityItems array: every element is one media entry.
        if 'qualityItems' in name and isinstance(initializer, ast.Array):
            for item in initializer.items:
                flashvars.append(self._quality_item(item))
class Media_Visitor(ASTVisitor):
    """Rebuild the ``videoUrl`` of one entry of the module-level ``flashvars``.

    While walking the script it records every variable that is initialised
    with a string literal (or with ``"a" + "b"``) and, when it meets the
    ``media_<i>`` variable, joins the recorded fragments named in that
    variable's ``+`` chain and stores the result in
    ``flashvars[i]['videoUrl']``. Only the first declarator of each ``var``
    statement is inspected.
    """

    def __init__(self, i, *args, **kwargs):
        """Create a visitor for the media entry at index *i*.

        Args:
            i: index of the video inside ``flashvars``.
            *args, **kwargs: forwarded to the base-class initializer.
        """
        # Index of the video this visitor resolves.
        self.i = i
        # Variable name -> unquoted string fragment.
        self.identifier = {}
        # Fragments of the media variable, right-most operand first.
        self.identifiers_list = []
        # The original called ``super(*args, **kwargs)``, which only builds
        # a ``super`` proxy; the base initializer must be invoked explicitly.
        super().__init__(*args, **kwargs)

    def _is_target(self, name):
        """Return True when *name* refers to ``media_<i>`` for this index."""
        # The negative lookahead keeps ``media_1`` from also matching
        # ``media_10``, ``media_11`` and so on.
        pattern = re.escape('media_' + str(self.i)) + r'(?!\d)'
        return re.search(pattern, name) is not None

    def get_Identifier(self, node, identifiers_list):
        """Append the fragments of a left-nested ``+`` chain, last one first.

        Args:
            node: a binary-operation node whose operands are identifiers
                (the left operand may be another binary operation).
            identifiers_list: list that receives the resolved fragments.

        Raises:
            KeyError: an operand names a variable that was not recorded.
        """
        while True:
            left, right = node.children()
            identifiers_list.append(self.identifier[right.value])
            if not isinstance(left, ast.BinOp):
                identifiers_list.append(self.identifier[left.value])
                return
            # Descend into the left operand; iterating avoids deep recursion
            # on long chains.
            node = left

    def visit_VarStatement(self, node):
        """Record string fragments or resolve the target media variable.

        Args:
            node: the ``VarStatement`` node being visited.
        """
        name_node, initializer = node.children()[0].children()
        if initializer is None:
            # ``var x;`` has nothing to record or resolve.
            return
        name = name_node.value

        if self._is_target(name):
            # Start from a clean list so a repeated declaration cannot
            # concatenate two URLs.
            self.identifiers_list = []
            self.get_Identifier(initializer, self.identifiers_list)
            # Fragments were gathered right-to-left; restore source order.
            flashvars[self.i]['videoUrl'] = ''.join(reversed(self.identifiers_list))
        elif isinstance(initializer, ast.String):
            self.identifier[name] = initializer.value[1:-1]
        elif isinstance(initializer, ast.BinOp) and initializer.op == '+':
            operands = initializer.children()
            # Only ``"a" + "b"`` is folded; anything else is left unrecorded.
            if len(operands) == 2 and all(isinstance(part, ast.String) for part in operands):
                self.identifier[name] = ''.join(part.value[1:-1] for part in operands)
def main(url):
    """Fetch *url*, decode its player script and print the media entries.

    The first inline ``<script>`` whose text contains ``flashvars`` is parsed
    into an AST. ``VarStatement_Visitor`` fills the module-level ``flashvars``
    list, one ``Media_Visitor`` per entry then fills in the ``videoUrl``, and
    the resulting list is printed.

    Args:
        url: address of the page to fetch.

    Raises:
        ValueError: no script on the page mentions ``flashvars``.
    """
    response = requests.get(url=url)
    script = next(
        (text for text in response.html.xpath('//script//text()') if 'flashvars' in text),
        None,
    )
    if script is None:
        # Previously a bare StopIteration escaped from here.
        raise ValueError('no <script> containing "flashvars" found at %s' % url)

    # Drop entries left over from an earlier call; otherwise the indices
    # passed to Media_Visitor below would point at the previous page's data.
    flashvars.clear()

    # Turn the JavaScript source into an AST.
    tree = Parser().parse(script)
    VarStatement_Visitor().visit(tree)
    for index in range(len(flashvars)):
        Media_Visitor(index).visit(tree)
    print(flashvars)