olivia_finder.data_source.repository_scrapers.pypi
import re
from typing import List, Optional

import requests
from typing_extensions import override
from bs4 import BeautifulSoup

from ..scraper_ds import ScraperDataSource
from ...myrequests.request_handler import RequestHandler
from ...myrequests.job import RequestJob
from ...utilities.exception import OliviaFinderException


class PypiScraper(ScraperDataSource):
    '''
    Scrapes the PyPI website to obtain information about Python packages.
    Implements the abstract class Scraper and, accordingly, the DataSource class.

    Attributes
    ----------
    PYPI_PACKAGE_LIST_URL : str
        URL of the PyPI page that lists every package name
    PYPI_PACKAGE_DATA_URL : str
        Base URL of the PyPI JSON API for per-package data
    '''

    # Class variables
    PYPI_PACKAGE_LIST_URL: str = "https://pypi.org/simple/"
    PYPI_PACKAGE_DATA_URL: str = "https://pypi.org/pypi/"

    # Leading distribution name of a PEP 508 dependency specifier, e.g.
    # "typing-extensions" in "typing-extensions (>=3.7); python_version<'3.8'".
    # Names are alphanumeric with inner '-', '_' or '.' allowed.
    _DEPENDENCY_NAME_RE = re.compile(r'^[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?')

    def __init__(self, request_handler: Optional[RequestHandler] = None):
        '''
        Constructor.

        Parameters
        ----------
        request_handler : Optional[RequestHandler]
            Handler used to perform HTTP requests; the superclass provides a
            default when None.
        '''
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Obtain the list of package names from the PyPI simple index.
        Implements the abstract method of the DataSource class.

        Returns
        -------
        List[str]
            List of package names (empty if the page could not be parsed)

        Raises
        ------
        OliviaFinderException
            If the request for the package list gets no response

        Example
        -------
        >>> pypi_scraper = PypiScraper()
        >>> pypi_scraper.obtain_package_names()
        ['package1', 'package2', ...]
        '''
        # Build the request job and fetch the HTML of the simple index
        job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL)
        job = self.request_handler.do_request(job)

        if job.response is None:
            raise OliviaFinderException(
                f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}'
            )

        soup = BeautifulSoup(job.response.text, 'html.parser')

        package_list = []
        try:
            # Each anchor on the simple index is one package name
            package_list = [a.text for a in soup.find_all('a')]
        except Exception as e:
            # Best effort: log the cause and fall through with whatever we have
            self.logger.error(
                f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}: {e}'
            )

        self.logger.info(f'Obtained {len(package_list)} packages from {self.PYPI_PACKAGE_LIST_URL}')
        return package_list

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the JSON API URL for a package.
        Implements the abstract method of the Scraper class.

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package's JSON endpoint
        '''
        return f'{self.PYPI_PACKAGE_DATA_URL}{package_name}/json'

    @override
    def _parser(self, response: requests.Response) -> dict:
        '''
        Parse the JSON data of a package and return the package data as a dictionary.

        Parameters
        ----------
        response : requests.Response
            Response of the request to the package data URL

        Returns
        -------
        dict
            Dictionary with the package data in the following format:
            {
                'name': str,
                'version': str,
                'url': str,
                'dependencies': [{'name': str, 'version': None}, ...]
            }
        '''
        data = response.json()
        info = data['info']

        # Build the list of dependencies, deduplicating names that appear with
        # several version specifiers.  No version is extracted: a requires_dist
        # entry carries a specifier set (e.g. ">=2.0,<3"), not a single version.
        dependencies = []
        if info['requires_dist'] is not None:
            # dict keys keep insertion order and drop duplicates
            seen = {}
            for dependency in info['requires_dist']:
                dependency_name = self._clean_name(dependency)
                if dependency_name:  # skip entries with no parseable name
                    seen[dependency_name] = None
            dependencies = [{'name': name, 'version': version} for name, version in seen.items()]

        # Build the dictionary and return it
        return {
            'name': info['name'],
            'version': info['version'],
            'url': info['project_url'],
            'dependencies': dependencies,
        }

    def _clean_name(self, name: str) -> str:
        '''
        Extract the distribution name from a PEP 508 dependency specifier,
        dropping version constraints, extras and environment markers.

        Unlike a naive split on non-word characters, this keeps hyphenated and
        dotted names intact: "typing-extensions (>=3.7)" -> "typing-extensions".

        Parameters
        ----------
        name : str
            Raw dependency specifier, e.g. "requests (>=2.25); extra == 'x'"

        Returns
        -------
        str
            Cleaned distribution name, or '' if no name could be found
        '''
        match = self._DEPENDENCY_NAME_RE.match(name.strip())
        return match.group(0) if match else ''
class PypiScraper(ScraperDataSource):
    '''
    Scraper data source for the PyPI website.

    Collects Python package information by scraping the PyPI simple index and
    querying the per-package JSON endpoint.  Implements the abstract class
    Scraper and, accordingly, the DataSource class.

    Attributes
    ----------
    PYPI_PACKAGE_LIST_URL : str
        URL of the PyPI page listing every package name
    PYPI_PACKAGE_DATA_URL : str
        Base URL of the per-package JSON endpoint
    '''

    # Class variables
    PYPI_PACKAGE_LIST_URL: str = "https://pypi.org/simple/"
    PYPI_PACKAGE_DATA_URL: str = "https://pypi.org/pypi/"

    def __init__(self, request_handler: Optional[RequestHandler] = None):
        '''Initialize the scraper, delegating handler setup to the superclass.'''
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Obtain the list of package names from the PyPI website.
        Implements the abstract method of the DataSource class.

        Returns
        -------
        List[str]
            Package names scraped from the simple index (empty on parse error)

        Raises
        ------
        OliviaFinderException
            If the request for the index page gets no response

        Example
        -------
        >>> pypi_scraper = PypiScraper()
        >>> pypi_scraper.obtain_package_names()
        ['package1', 'package2', ...]
        '''
        # Request the simple-index HTML through the shared request handler
        index_job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL)
        index_job = self.request_handler.do_request(index_job)

        if index_job.response is None:
            raise OliviaFinderException(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')

        parsed_page = BeautifulSoup(index_job.response.text, 'html.parser')

        names = []
        try:
            # One <a> element per package on the simple index
            names = [anchor.text for anchor in parsed_page.find_all('a')]
        except Exception:
            self.logger.error(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')

        self.logger.info(f'Obtained {len(names)} packages from {self.PYPI_PACKAGE_LIST_URL}')
        return names

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL used to scrape one package.
        Implements the abstract method of the Scraper class.

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package's JSON endpoint
        '''
        return f'{self.PYPI_PACKAGE_DATA_URL}{package_name}/json'

    @override
    def _parser(self, response: requests.Response) -> dict:
        '''
        Parse the JSON data of a package into a dictionary.

        Parameters
        ----------
        response : requests.Response
            Response of the request to the package data URL

        Returns
        -------
        dict
            Package data:
            {
                'name': str,
                'version': str,
                'url': str,
                'dependencies': [{'name': str, 'version': None}, ...]
            }
        '''
        payload = response.json()
        info = payload['info']

        dependencies = []
        raw_dependencies = info['requires_dist']
        if raw_dependencies is not None:
            # dict keys preserve insertion order while collapsing duplicate
            # names that differ only in their version specifier
            unique_names = {}
            for raw_entry in raw_dependencies:
                unique_names[self._clean_name(raw_entry)] = None

            dependencies = [
                {'name': dep_name, 'version': dep_version}
                for dep_name, dep_version in unique_names.items()
            ]

        return {
            'name': info['name'],
            'version': info['version'],
            'url': info['project_url'],
            'dependencies': dependencies,
        }

    def _clean_name(self, name: str) -> str:
        '''
        Clean the package name from versions and other characters.

        Parameters
        ----------
        name : str
            Name of the package

        Returns
        -------
        str
            Cleaned name of the package
        '''
        # Collapse every run of non-word characters into a single space, then
        # keep the first whitespace-separated token as the name
        separator_pattern = re.compile(r'[^\w\d]+')
        normalized = separator_pattern.sub(' ', name)
        return normalized.split()[0]
Class that scrapes the PyPI website to obtain information about Python packages. Implements the abstract class Scraper and, accordingly, the DataSource class.
Attributes
- PYPI_PACKAGE_LIST_URL (str): URL of the PyPI website where the list of packages is located
- PYPI_PACKAGE_DATA_URL (str): URL of the PyPI website where the data of a package is located
PypiScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
30 def __init__(self, request_handler: Optional[RequestHandler] = None): 31 ''' 32 Constructor 33 ''' 34 35 super().__init__(request_handler)
Constructor
@override
def
obtain_package_names(self) -> List[str]:
37 @override 38 def obtain_package_names(self) -> List[str]: 39 ''' 40 Obtain the list of packages names from the PyPI website 41 Implements the abstract method of DataSource class 42 43 Returns 44 ------- 45 List[str] 46 List of packages names 47 48 Handles 49 ------- 50 Exception 51 If there is an error obtaining the list of packages, it returns an empty list 52 53 Example 54 ------- 55 >>> pypi_scraper = PypiScraper() 56 >>> pypi_scraper.obtain_package_names() 57 ['package1', 'package2', ...] 58 ''' 59 60 # Build the request job 61 job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL) 62 63 # Get the HTML of the page 64 job = self.request_handler.do_request(job) 65 66 if job.response is None: 67 raise OliviaFinderException(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}') 68 69 soup = BeautifulSoup(job.response.text, 'html.parser') 70 71 pakage_list = [] 72 try: 73 # Get the list of packages 74 pakage_list = [a.text for a in soup.find_all('a')] 75 except Exception as e: 76 self.logger.error(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}') 77 78 self.logger.info(f'Obtained {len(pakage_list)} packages from {self.PYPI_PACKAGE_LIST_URL}') 79 return pakage_list
Obtain the list of package names from the PyPI website. Implements the abstract method of the DataSource class.
Returns
- List[str]: List of package names
Handles
Exception: if there is an error obtaining the list of packages, an empty list is returned.
Example
>>> pypi_scraper = PypiScraper()
>>> pypi_scraper.obtain_package_names()
['package1', 'package2', ...]