python 采集m3u8视频

初学 python,很多高级的东西都还不知道,所以代码很臃肿

其实下视频最简单的方法就是用手机浏览器,像夸克啊之类的,自带的播放器打开视频播放网站,然后点击浏览器提供的下载功能,他其实也是通过解析网站的m3u8下载

但是问题在于公司的无线网把我们的外网都墙了,流量又限速,又是个影迷,就想着用电脑下好视频,传到手机上看。走起

思路:

1、先找到一个在线影院首页或搜索接口(不是腾优爱的那种,也不是某一影片的播放地址);

2、分析该影院的页面、网络接口;

3、开始撸代码

4、让用户输入需要搜索并下载的影片名称

5、请求接口,展现所有搜索到的影片

6、让用户输入需要下载搜索到的哪一个

7、以影片名称在代码文件夹创建文件夹

8、开始下载

实现功能:

1、影院不挂,程序就不挂,且不对单一影片下载,支持用户输入任意影片

2、下载方式为单线程,边下载边合并为mp4,因为我尝试过使用多线程下载,确实会变快很多,但是后期合并各个线程下载的视频时,简直龟速

3、记录下载集数以及下载ts片段数,所以支持断点续下

完成代码:

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

from bs4 import BeautifulSoup

import requests

import re

from Crypto.Cipher import AES

import os

def tsList(Index):
    """Fill ``m3u8.txt`` with the key and segment URLs, then start the download.

    ``m3u8.txt`` (in the current working directory) must already hold the
    playlist URL on its first line. Unless the file already contains the
    text ``.ts``, the playlist is fetched and the following is appended:

    * the text of the ``#EXT-X-KEY`` URI response followed by a newline, or
      the placeholder ``000000000000`` (written without a newline, exactly
      as before) when the playlist declares no key;
    * one line per comma-separated chunk of the playlist, holding the first
      URL found in that chunk, or an empty line when none is found. Chunks
      are only processed when the playlist request returned status 200.

    ``Download(Index)`` is then called in every case.

    Args:
        Index: Episode index, passed through unchanged to ``Download``.
    """
    m3u8_path = os.getcwd() + '/m3u8.txt'

    # Read once and close the file before any network or download work.
    with open(m3u8_path, 'r') as f:
        stored_lines = f.readlines()

    if any('.ts' in line for line in stored_lines):
        print('ts视频链接均已存储,无需重复请求')
    else:
        print('开始获取并存储ts链接')
        m3u8Url = stored_lines[0].strip()

        content = requests.get(m3u8Url, headers=headers)
        jiami = re.findall('#EXT-X-KEY:(.*)\n', content.text)
        m3u8Url_before = ''
        pending = []  # text to append to m3u8.txt, in order

        if jiami:
            # Stop at the closing quote of the URI attribute so that later
            # quoted attributes on the same line are not swallowed.
            uri = re.search(r'URI="([^"]*)"', jiami[0])
            key = uri.group(1) if uri else ''
            if 'http' not in key:
                # Derive the "https://<host>" prefix from the playlist URL.
                # NOTE(review): when the '"url":"' marker is absent, find()
                # returns -1 and the slice starts at offset 6, which only
                # lines up with the host for URLs starting with "https:".
                m3u8Start = m3u8Url.find("\"url\":\"") + 7
                m3u8End = m3u8Url.find(".m3u8") + 5
                m3u8Url = m3u8Url[m3u8Start:m3u8End].replace('\\', '')
                m3u8Url_before = "https://" + m3u8Url.split('/')[2]
            # headers must be passed by keyword: the second positional
            # argument of requests.get is ``params``.
            keycontent = requests.get(m3u8Url_before + key, headers=headers).text
            pending.append(keycontent + '\n')
        else:
            # No trailing newline here; the line is terminated by the
            # first chunk written below. Download() relies on this layout.
            pending.append('000000000000')

        if content.status_code == 200:
            pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
            for item in content.text.split(","):
                if 'http' in item:
                    candidate = item
                else:
                    # Chunks without an absolute URL get the host prefix.
                    candidate = (m3u8Url_before + item).replace("\n", "")
                found = pattern.findall(candidate)
                pending.append((found[0] if found else '') + '\n')

        with open(m3u8_path, 'a') as f:
            f.write(''.join(pending))

    Download(Index)

def Download(Index):
    """Download the segments listed in ``m3u8.txt`` and append them to one mp4.

    Files used in the current working directory:

    * ``m3u8.txt`` - line 1 (zero-based) holds the key text or the
      placeholder ``000000000000``; later lines hold segment URLs.
    * ``index.txt`` - line number of the last segment appended, used to
      resume. It is rewritten after every appended segment.
    * ``jishu.txt`` - set to ``Index + 1`` once the episode is finished.
    * ``<videoName>_第<Index+1>集.mp4`` - output, opened in append mode.

    Each segment is requested up to 9 times with a 3 second timeout. A
    segment that never succeeds is skipped. When the episode is finished,
    ``m3u8.txt`` is emptied and ``index.txt`` is reset to ``0``.

    Args:
        Index: Zero-based episode index; used for the output file name and
            the value stored in ``jishu.txt``.

    Returns:
        True once the list has been processed.
    """
    cwd = os.getcwd()
    now = videoName + "_第" + str(Index+1) + '集'

    try:
        with open(cwd + '/index.txt', 'r') as f3:
            index = int(float(f3.read()))
    except FileNotFoundError:
        # NOTE(review): a first run starts after line 4, while the reset at
        # the end of this function stores 0 - confirm which is intended.
        index = 4

    with open(cwd + '/m3u8.txt', 'r') as m3u8File:
        lines = m3u8File.readlines()

    keycontent = lines[1][0:-1]
    print(keycontent)
    encrypted = keycontent != '000000000000'
    if encrypted:
        # NOTE(review): one CBC cipher (IV = key) is reused for every
        # segment, so decryption state carries across segments - confirm
        # this matches how the source encrypts its segments.
        cryptor = AES.new(keycontent.encode('utf-8'), AES.MODE_CBC, keycontent.encode('utf-8'))
    else:
        print("未加密")

    remaining = lines[index:]
    total = len(remaining) - 1

    # ``position`` is the line number inside m3u8.txt. Storing it (rather
    # than a count of successes) keeps the resume point correct even when
    # some lines are skipped.
    for position, line in enumerate(remaining[1:], start=index + 1):
        print("预估进度:" + str(position - 4) + '/' + str(total) + "   即将下载:" + line)
        url = line.strip()  # the stored line ends with a newline
        if not url.startswith('http'):
            # Blank or non-URL lines cannot be requested.
            continue
        name = url[-11:]

        for attempt in range(1, 10):
            try:
                response = requests.get(url, headers=headers, timeout=3)
            except requests.exceptions.RequestException:
                print(name + '请求超时,重新请求第' + str(attempt) + '次')
                continue
            if response.status_code != 200:
                continue
            print(name + '请求成功')

            if encrypted:
                try:
                    cont = cryptor.decrypt(response.content)
                except (ValueError, TypeError) as err:
                    # Never fall through with stale or missing data: a
                    # failed decrypt counts as a failed attempt.
                    print(name + '解密失败(' + str(err) + '),重新请求第' + str(attempt) + '次')
                    continue
            else:
                print('未加密,直接追加')
                cont = response.content

            with open(cwd + '/' + now + '.mp4', 'ab+') as f:
                f.write(cont)
            print("追加完成,已标记")
            with open(cwd + '/index.txt', 'w') as f:
                f.write(str(position))
            break
        else:
            print(name + '多次请求失败,已跳过')

    with open(cwd + '/jishu.txt', 'w') as indexFile:
        indexFile.write(str(Index+1) + ' \n')
    # Opening in 'w' mode empties the segment list for the next episode.
    with open(cwd + '/m3u8.txt', 'w'):
        pass
    with open(cwd + '/index.txt', 'w') as f:
        f.write('0')
    return True

def getM3u8(htmlUrl):
    """Locate the m3u8 playlist referenced by a play page and seed ``m3u8.txt``.

    The page's ``<script>`` tags are scanned in order for one whose text
    contains ``.m3u8`` and an http(s) URL. The module-level global
    ``m3u8Url_before`` is set to that URL's scheme-and-host prefix.

    If the URL does not contain ``hls`` it is fetched and its last non-empty
    line is taken as the next playlist reference (resolved against the
    fetched URL); otherwise the URL itself is used.

    Args:
        htmlUrl: URL of the play page.

    Returns:
        1 - ``m3u8.txt`` now holds one line: the playlist URL.
        2 - the fetched text ended with an ENDLIST tag; ``m3u8.txt`` now
            holds ``hello~~``, the placeholder key ``000000000000`` and one
            line per segment entry.
        3 - no usable m3u8 link was found in any script tag.
    """
    from urllib.parse import urljoin

    global m3u8Url_before

    m3u8_path = os.getcwd() + '/m3u8.txt'
    content = requests.get(htmlUrl, headers=headers).text
    bsObj = BeautifulSoup(content, "html.parser")
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    for scriptItem in bsObj.findAll("script"):
        script_text = str(scriptItem)
        if '.m3u8' not in script_text:
            continue
        found = pattern.search(script_text)
        if found is None:
            # ".m3u8" is mentioned but no absolute URL is present.
            continue

        m3u8Url = found.group()
        m3u8Url_before = getDomain(m3u8Url)

        playlist = None
        if 'hls' not in m3u8Url:
            playlist = requests.get(m3u8Url, headers=headers).text
            entries = [ln.strip() for ln in playlist.split('\n') if ln.strip()]
            if not entries:
                continue
            # The last non-empty line is the reference to follow; urljoin
            # handles absolute, root-relative and path-relative forms.
            m3u8Url_hls = urljoin(m3u8Url, entries[-1])
        else:
            m3u8Url_hls = m3u8Url

        if 'ENDLIST' not in m3u8Url_hls:
            # Always end the line so that text appended later (the key)
            # starts on its own line.
            with open(m3u8_path, 'w') as f:
                f.write(m3u8Url_hls.strip() + '\n')
            return 1

        # The fetched text is itself a segment list: store it directly.
        if playlist is None:
            playlist = requests.get(m3u8Url, headers=headers).text
        stored = ['hello~~\n', '000000000000\n']
        for chunk in playlist.split(","):
            if '/' not in chunk:
                continue
            # Keep the text between the chunk's first character and the
            # next newline.
            head, newline, _ = chunk[1:].partition('\n')
            if not newline:
                continue
            stored.append(head + '\n')
        # 'w' rather than append: a leftover list from an interrupted run
        # must not be duplicated.
        with open(m3u8_path, 'w') as f:
            f.writelines(stored)
        return 2

    print("在页面Script中未找到相关m3u8链接。。。")
    return 3

def jiexiHtml(htmlUrl):
    """Parse a title's detail page and download its episodes in order.

    Sets the module-level global ``videoName`` from the page's
    ``h1.page-title`` element, writes every episode link found under
    ``div.scroll-content`` to ``<videoName>.txt`` (one absolute URL per
    line), creates ``jishu.txt`` with ``0`` when it does not exist, and then
    calls ``openHtml`` for each episode from the recorded position onwards.

    Args:
        htmlUrl: URL of the detail page.

    Raises:
        AttributeError: if the title or the episode container is not found
            (``find`` returns None).
    """
    global videoName

    cwd = os.getcwd()
    content = requests.get(htmlUrl, headers=headers).text
    bsObj = BeautifulSoup(content, "html.parser")

    # attrs must be a dict; the previous set literal only matched because
    # a non-dict value is treated as a list of class names.
    videoName = bsObj.find('h1', {'class': 'page-title'}).text
    videoItemHtml = bsObj.find('div', {'class': 'scroll-content'}).findAll('a', href=re.compile("^(/ShowInfo/)((?!:).)*$"))

    episode_urls = []
    for anchor in videoItemHtml:
        item = str(anchor.attrs['href'])
        if 'http' not in item:
            item = getDomain(htmlUrl) + item
        episode_urls.append(item)

    with open(cwd + '/' + videoName + '.txt', 'w') as indexFile:
        indexFile.writelines(url + '\n' for url in episode_urls)
    print("indexFile已更新")

    jishu_path = cwd + '/jishu.txt'
    try:
        # 'x' creates the file only when it is missing.
        with open(jishu_path, 'x') as indexFile:
            indexFile.write('0 \n')
    except FileExistsError:
        pass
    else:
        print("jishu不存在,已创建并置零")

    with open(jishu_path, 'r') as indexFile:
        jishu = int(indexFile.read())
    print("txt记录集数为:" + str(jishu))

    # The URLs are used directly (without the trailing newline they carry
    # in the list file) so that requests receives a clean URL.
    for htmlItem in episode_urls[jishu:]:
        # Re-read the counter each time: Download() updates it on success.
        with open(jishu_path, 'r') as indexFile:
            jishuRun = int(indexFile.read())
        print("txt记录集数为:" + str(jishuRun))
        openHtml(htmlItem, jishuRun)

def openHtml(htmlItem, jishu):
    """Resolve one episode page and hand it to the matching download path.

    Args:
        htmlItem: URL of the episode's play page.
        jishu: Episode index forwarded to ``tsList`` or ``Download``.

    Returns:
        True when ``getM3u8`` returned 2 and ``Download`` reported success;
        None in every other case.
    """
    print("当前打开链接:" + htmlItem)
    outcome = getM3u8(htmlItem)

    if outcome == 1:
        # Only the playlist URL is stored so far; tsList expands it.
        tsList(jishu)
        return None
    if outcome == 2 and Download(jishu):
        # The segment list is already stored; download straight away.
        return True
    return None

         

def getDomain(htmlUrl):
    """Return the ``scheme://host[:port]`` prefix of the first URL in *htmlUrl*.

    The host may be followed by a path, a query, a fragment or the end of
    the string.

    Args:
        htmlUrl: Text containing an http or https URL.

    Returns:
        The matched prefix, e.g. ``https://example.com``.

    Raises:
        ValueError: if no http(s) URL is found.
    """
    found = re.search(r'http[s]?://[a-zA-Z\-.0-9]+(?::[0-9]+)?(?=[/?#]|$)', htmlUrl)
    if found is None:
        raise ValueError('no http(s) URL found in ' + repr(htmlUrl))
    return found.group()

def searchVideo(searchName):
    """Query the site's search endpoint for *searchName*.

    Args:
        searchName: Title text sent as the ``searchword`` query parameter.

    Returns:
        The ``a.module-item-pic`` elements of the result page, or False
        when there are none.
    """
    response = requests.get(
        '你找的影院地址',
        headers=headers,
        params={'searchword': searchName},
    )
    soup = BeautifulSoup(response.text, "html.parser")
    matches = soup.findAll("a", {"class": "module-item-pic"})
    if not matches:
        return False
    return matches

if __name__ == '__main__':
    # Interactive entry point: search, list the hits, let the user pick one,
    # then download it into a folder named after the search term.
    searchName = input('输入想搜索的剧名:')

    # Request headers shared by every helper through this module-level
    # name; fill in whatever the target site needs.
    headers = {}

    htmlUrlList = searchVideo(searchName)
    if not htmlUrlList:
        print("未搜索到相关影片")
        print("程序即将中断")
        # SystemExit (unlike os._exit) flushes the messages printed above.
        raise SystemExit(0)

    for li, i in enumerate(htmlUrlList, start=1):
        title = i.find('img', {"class": "lazyload"}).attrs['alt']
        print(str(li) + '、' + title + '\n')

    liNo = input('选择需要尝试下载的影片的编号:')
    try:
        choice = int(liNo)
    except ValueError:
        choice = 0  # non-numeric input is reported as out of range

    # Validate explicitly: indexing would raise for large numbers and
    # silently pick the last entry for 0.
    if 1 <= choice <= len(htmlUrlList):
        htmlUrl = htmlUrlList[choice - 1].attrs['href']
        target_dir = os.getcwd() + '/' + searchName
        os.makedirs(target_dir, exist_ok=True)
        # All helper files are created relative to the working directory.
        os.chdir(target_dir)
        jiexiHtml('你找的影院地址' + htmlUrl)
    else:
        print("数值超出范围")

  ps:代码中的影院地址需要屏幕前的你自己找啦,其实随便找找都行,需要注意的是,至少需要同步修改的有:jiexiHtml方法中的标签名、searchVideo方法中搜索影片的参数名

贴图: