In some situations, such as this one, you can study the page's code a little and make the same requests that its JavaScript would make.
Here's a class I implemented that mimics this. It's written for Python 2.7. If you step through each function in a debugger, the flow is easy to follow.
# anime.py
from bs4 import BeautifulSoup
import urllib2
import re
class Anime2MP4(object):
    """Scrape direct ``.mp4`` links for the episodes listed on a series page.

    The flow mirrors the requests the site's own JavaScript would make:

    1. download the page at ``anime_zero_url`` and collect the episode hrefs
       found inside the ``serie_lista_episodios`` element;
    2. take two path segments from each href and substitute them into
       ``anime_url_format`` to obtain the player URL;
    3. download each player page and pick the first ``.mp4`` URL whose text
       contains the requested quality tag.

    Written for Python 2.7 (it relies on ``urllib2``).
    """

    anime_zero_url = 'http://www.animesproject.com.br/serie/885/2162/Death-Parade-Episodio-00'  # noqa
    anime_url_format = 'http://www.animesproject.com.br/playerv52/player.php?a=0&0={0}&1={1}'  # noqa

    # Episode hrefs; filled in by run(), or lazily by get_parameters().
    episodes = None

    # Generic URL matcher used to pull every link out of a player page.
    # Compiled once, as a raw string so the backslashes reach `re` untouched.
    _url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'  # noqa
    )

    def build_episode_url(self, url_parameters):
        """Return ``anime_url_format`` filled in with ``url_parameters``.

        ``url_parameters`` is a sequence whose first two items replace the
        ``{0}`` and ``{1}`` placeholders of the player URL.
        """
        return self.anime_url_format.format(*url_parameters)

    def get_episodes_url(self):
        """Return the ``href`` of every link in the episode list.

        Downloads ``anime_zero_url`` and looks for the element whose id is
        ``serie_lista_episodios``. Returns an empty list when that element
        is not present in the page.
        """
        webpage = urllib2.urlopen(self.anime_zero_url)
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(webpage, 'html.parser')
        finally:
            # urllib2 responses are not context managers in Python 2.
            webpage.close()

        container = soup.find(id='serie_lista_episodios')
        if container is None:
            return []
        return [link['href'] for link in container.find_all('a', href=True)]

    def get_parameters(self):
        """Return one ``(segment_2, segment_3)`` tuple per episode href.

        Each href in ``self.episodes`` is split on ``'/'`` and the items at
        index 2 and 3 are kept. Hrefs with fewer than four segments are
        skipped. If ``self.episodes`` has not been set yet it is fetched
        with ``get_episodes_url()``.
        """
        if self.episodes is None:
            self.episodes = self.get_episodes_url()

        pars = []
        for href in self.episodes:
            segments = href.split('/')
            if len(segments) > 3:
                pars.append((segments[2], segments[3]))
        return pars

    def get_mp4_episode(self, url, quality='MQ'):
        """Return the first ``.mp4`` URL in the page at ``url``, or ``None``.

        quality: substring the link must contain; can be HD or MQ.
        """
        webpage = urllib2.urlopen(url)
        try:
            html_content = webpage.read()
        finally:
            webpage.close()
        return self._find_mp4_url(html_content, quality)

    def _find_mp4_url(self, html_content, quality):
        """Return the first matching ``.mp4`` URL in ``html_content``."""
        # Scanning in document order makes the choice deterministic when
        # several links match.
        for link in self._url_pattern.findall(html_content):
            if link.endswith('.mp4') and quality in link:
                return link
        return None

    def run(self):
        """Fetch the episode list and print ``<index> <mp4 url>`` per episode.

        Stores the episode hrefs in ``self.episodes``. All links are
        resolved before anything is printed.
        """
        self.episodes = self.get_episodes_url()
        list_episode = [
            self.build_episode_url(pars) for pars in self.get_parameters()
        ]
        mp4_links = [self.get_mp4_episode(link) for link in list_episode]
        for num, ep in enumerate(mp4_links):
            # Same output as the Python 2 statement `print num, ep`, in a
            # form that also parses on Python 3.
            print('%s %s' % (num, ep))
if __name__ == '__main__':
    # Script entry point: scrape the configured series and print its links.
    Anime2MP4().run()
The `urllib` module only fetches the page's markup (i.e. the text); it does not even load the external resources on its own (images, styles, scripts...). To do what you want, you really need a complete browser, one that builds the DOM, runs JavaScript, etc. I suggest taking a look at Ghost.py or at other "headless" browsers (i.e. ones without a graphical interface). – mgibsonbr