Source code for pyscrapers.workers.pornhub

"""
Module to handle scraping of pornhub.

References:
- https://pypi.org/project/pornhub-api/
"""
import logging
import tempfile
from itertools import islice

import pornhub_api
from pornhub_api import PornhubApi

from pyscrapers.configs import ConfigPornhubSearch, ConfigUrl, ConfigDebugUrls
from pyscrapers.core.url_set import UrlSet
from pyscrapers.core.ext_lxml import get_html_dom_content, get_element_as_bytes
from pyscrapers.workers.youtube_dl_handlers import youtube_dl_download_url, youtube_dl_download_urls


















def get_code(e: ValueError) -> int:
    """
    Extract the numeric error code carried by an API ValueError.

    The exception's first argument is expected to be a mapping with a
    "code" entry; it is returned as an int.

    :param e: the ValueError raised by the API layer
    :return: the numeric error code
    """
    payload = e.args[0]
    return int(payload["code"])
def get_number_of_pages(root) -> int:
    """
    Return the number of listing pages for a pornstar.

    Locates the single 'showingInfo' counter element; its fourth
    whitespace-separated token is the total number of videos. The total
    is divided by the 36 items shown per page, rounding up.

    :param root: parsed HTML DOM root of the listing page
    :return: total number of pages
    """
    matches = root.xpath("//div[contains(@class,'showingInfo')]")
    assert len(matches) == 1
    total_videos = int(matches[0].text.strip().split()[3])
    videos_per_page = 36
    # ceiling division: (n + p - 1) // p
    return (total_videos + videos_per_page - 1) // videos_per_page
def get_urls_from_page(root) -> list[str]:
    """
    Collect the video urls listed on a single page.

    When debug saving is enabled, the page DOM is first dumped to a
    temporary file. The known video <ul> sections are then scanned and a
    view_video url is built from each list item's video key attribute.

    :param root: parsed HTML DOM root of the page
    :return: list of video urls found on the page
    """
    logger = logging.getLogger(__name__)
    if ConfigDebugUrls.save:
        with tempfile.NamedTemporaryFile(delete=False) as dump_file:
            logger.info(f"writing file [{dump_file.name}]")
            dump_file.write(get_element_as_bytes(root))
    section_queries = [
        "//ul[@id='uploadedVideosSection']",
        "//ul[@id='moreData']",
        "//ul[@id='mostRecentVideosSection']",
        "//ul[@id='showAllChanelVideos']",
        "//ul[@id='pornstarsVideoSection']",
        "//ul[@id='modelMostRecentVideosSection']",
        # there are extras which we don't need (they provide extra
        # movies which have nothing to do with the page involved)
        # "//ul[@id='hottestMenuSection']",
        # "//ul[@id='recommMenuSection']",
        # "//ul[@id='claimedUploadedVideoSection']",
        # "//ul[@id='claimedRecentVideoSection']",
        # "//ul[@id='\"videosUploadedSection']",
        # "//ul[@id='\"modelPaidClips']",
    ]
    # gather every matching <ul> section across all queries
    sections = [node for query in section_queries for node in root.xpath(query)]
    found_urls = []
    for section in sections:
        for item in section.xpath("li[contains(@class,'pcVideoListItem')]"):
            video_key = item.attrib['data-video-vkey']
            found_urls.append(f"https://www.pornhub.com/view_video.php?viewkey={video_key}")
    return found_urls
def url_generator(url: str):
    """
    Yield the base url, then page-suffixed variants, without end.

    Produces url, url?page=2, url?page=3, ... — the caller is expected
    to stop consuming once a page comes back empty or non-200.

    :param url: base listing url
    """
    page_number = 1
    while True:
        yield url if page_number == 1 else f"{url}?page={page_number}"
        page_number += 1
def download_url(session) -> None:
    """
    Download all videos behind the configured listing url.

    Walks the listing page by page (via url_generator), harvesting the
    video urls from each page, until a non-200 response or an empty
    page is met, then hands the accumulated urls to youtube-dl.

    :param session: requests-like session used to fetch the listing pages;
        it is closed by this function, even on error
    :return: None
    """
    logger = logging.getLogger(__name__)
    urls = UrlSet()
    try:
        for url in url_generator(url=ConfigUrl.url):
            logger.info(f"getting [{url}]...")
            response = session.get(url)
            if response.status_code != 200:
                logger.info(f"got code [{response.status_code}]...")
                break
            root = get_html_dom_content(response)
            new_urls = get_urls_from_page(root)
            if not new_urls:
                # an empty page means we walked past the last real page
                break
            logger.info(f"got [{len(new_urls)}] new urls")
            for i, new_url in enumerate(new_urls):
                logger.info(f"url {i} is [{new_url}]")
            urls.extend(new_urls)
    finally:
        # fix: previously the session leaked if a fetch/parse step raised;
        # close it unconditionally
        session.close()
    logger.info(f"got total [{len(urls.urls_list)}] urls")
    logger.info(f"got [{urls.appended_twice}] appended twice urls")
    youtube_dl_download_urls(urls.urls_list)