olivia_finder.data_source.repository_scrapers.pypi

  1import re
  2from typing import List, Optional
  3import requests
  4from typing_extensions import override
  5from bs4 import BeautifulSoup
  6from ..scraper_ds import ScraperDataSource
  7from ...myrequests.request_handler import RequestHandler
  8from ...myrequests.job import RequestJob
  9from ...utilities.exception import OliviaFinderException
 10
 11class PypiScraper(ScraperDataSource):
 12    ''' 
 13    Class that scrapes the PyPI website to obtain information about Python packages
 14    Implements the abstract class Scraper and accordingly DataSource class
 15    
 16    Attributes
 17    ----------
 18    PYPI_PACKAGE_LIST_URL : str
 19        URL of the PyPI website where the list of packages is located
 20    PYPI_PACKAGE_DATA_URL : str
 21        URL of the PyPI website where the data of a package is located
 22    '''
 23
 24    # Class variables  
 25    PYPI_PACKAGE_LIST_URL: str  = "https://pypi.org/simple/"
 26    PYPI_PACKAGE_DATA_URL: str  = "https://pypi.org/pypi/"
 27
 28
 29    def __init__(self, request_handler: Optional[RequestHandler] = None):
 30        '''
 31        Constructor
 32        '''
 33
 34        super().__init__(request_handler)
 35
 36    @override
 37    def obtain_package_names(self) -> List[str]:
 38        '''
 39        Obtain the list of packages names from the PyPI website
 40        Implements the abstract method of DataSource class
 41
 42        Returns
 43        -------
 44        List[str]
 45            List of packages names
 46            
 47        Handles
 48        -------
 49        Exception
 50            If there is an error obtaining the list of packages, it returns an empty list
 51            
 52        Example
 53        -------
 54        >>> pypi_scraper = PypiScraper()
 55        >>> pypi_scraper.obtain_package_names()
 56        ['package1', 'package2', ...]
 57        '''
 58
 59        # Build the request job
 60        job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL)
 61
 62        # Get the HTML of the page
 63        job = self.request_handler.do_request(job)
 64
 65        if job.response is None:
 66            raise OliviaFinderException(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
 67        
 68        soup = BeautifulSoup(job.response.text, 'html.parser')
 69        
 70        pakage_list = []
 71        try:
 72            # Get the list of packages
 73            pakage_list = [a.text for a in soup.find_all('a')]
 74        except Exception as e:
 75            self.logger.error(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
 76
 77        self.logger.info(f'Obtained {len(pakage_list)} packages from {self.PYPI_PACKAGE_LIST_URL}')
 78        return pakage_list
 79    
 80    @override
 81    def _build_url(self, package_name: str) -> str:
 82        '''
 83        Build the URL to scrape a package
 84        Implements the abstract method of Scraper class
 85        Parameters
 86        ----------
 87        pkg_name : str
 88            Name of the package
 89
 90        Returns
 91        -------
 92        str
 93            URL to scrape
 94        '''
 95        return f'{self.PYPI_PACKAGE_DATA_URL}{package_name}/json'
 96
 97    @override
 98    def _parser(self, response: requests.Response) -> dict:
 99        '''
100        Parse the JSON data of a package and return the package data as a dictionary
101        
102        Parameters
103        ----------
104        response : requests.Response
105            Response of the request to the package data URL
106        
107        Returns
108        -------
109        dict
110            dictionary with the package data in the following format:
111            {
112                'name': name: str,
113                'version': version: str,
114                'url': url: str,
115                'dependencies': dependencies: List[str]
116            }
117        '''
118        # Parse the JSON
119        data = response.json()
120
121        # Get the dependencies if they exist, build the list of dependencies as Package objects
122
123        dependencies = []
124        if data['info']['requires_dist'] is not None:
125            dependencies_raw = data['info']['requires_dist']
126            
127            # Build a dictionary with the dependencies to avoid duplicates with different versions
128            dependencies_dict = {}
129            
130            for dependency in dependencies_raw:
131                
132                # # Split the dependency in name and version
133                # dependency_data = dependency.split(' ')
134
135                # # check if the dependency has a version
136                # if len(dependency_data) == 1:
137                #     dependencies_Dict[dependency_data[0]] = ""
138                # else:
139                #     # Add the dependency to the dictionary
140                #     # dependency_name: dependency_version
141                #     dependencies_Dict[dependency_data[0]] = dependency_data[1]
142
143                # Get the name of the dependency
144                dependency_name = self._clean_name(dependency)
145                dependencies_dict[dependency_name] = None
146                
147            # Build the list of dependencies as dictionaries
148            dependencies = [{'name': name, 'version': version} for name, version in dependencies_dict.items()]
149            
150            
151        # Build the dictionary and return it
152        return {
153            'name': data['info']['name'],
154            'version': data['info']['version'],
155            'url': data['info']['project_url'],
156            'dependencies': dependencies,
157        }    
158
159    def _clean_name(self, name: str) -> str:
160        '''
161        Clean the package name from versions and other characters
162
163        Parameters
164        ----------
165        name : str
166            Name of the package
167        
168        Returns
169        -------
170        str
171            Cleaned name of the package
172            
173        '''
174        
175        regex = re.compile(r'[^\w\d]+')
176
177        # Reemplazar cualquier coincidencia de la expresión regular en la cadena de dependencia con un espacio en blanco
178        name = regex.sub(' ', name)
179
180        # Obtenga el primer elemento de la lista resultante después de dividir la cadena por espacios en blanco
181        name = name.split()[0]
182
183        return name
 12class PypiScraper(ScraperDataSource):
 13    ''' 
 14    Class that scrapes the PyPI website to obtain information about Python packages
 15    Implements the abstract class Scraper and accordingly DataSource class
 16    
 17    Attributes
 18    ----------
 19    PYPI_PACKAGE_LIST_URL : str
 20        URL of the PyPI website where the list of packages is located
 21    PYPI_PACKAGE_DATA_URL : str
 22        URL of the PyPI website where the data of a package is located
 23    '''
 24
 25    # Class variables  
 26    PYPI_PACKAGE_LIST_URL: str  = "https://pypi.org/simple/"
 27    PYPI_PACKAGE_DATA_URL: str  = "https://pypi.org/pypi/"
 28
 29
 30    def __init__(self, request_handler: Optional[RequestHandler] = None):
 31        '''
 32        Constructor
 33        '''
 34
 35        super().__init__(request_handler)
 36
 37    @override
 38    def obtain_package_names(self) -> List[str]:
 39        '''
 40        Obtain the list of packages names from the PyPI website
 41        Implements the abstract method of DataSource class
 42
 43        Returns
 44        -------
 45        List[str]
 46            List of packages names
 47            
 48        Handles
 49        -------
 50        Exception
 51            If there is an error obtaining the list of packages, it returns an empty list
 52            
 53        Example
 54        -------
 55        >>> pypi_scraper = PypiScraper()
 56        >>> pypi_scraper.obtain_package_names()
 57        ['package1', 'package2', ...]
 58        '''
 59
 60        # Build the request job
 61        job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL)
 62
 63        # Get the HTML of the page
 64        job = self.request_handler.do_request(job)
 65
 66        if job.response is None:
 67            raise OliviaFinderException(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
 68        
 69        soup = BeautifulSoup(job.response.text, 'html.parser')
 70        
 71        pakage_list = []
 72        try:
 73            # Get the list of packages
 74            pakage_list = [a.text for a in soup.find_all('a')]
 75        except Exception as e:
 76            self.logger.error(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
 77
 78        self.logger.info(f'Obtained {len(pakage_list)} packages from {self.PYPI_PACKAGE_LIST_URL}')
 79        return pakage_list
 80    
 81    @override
 82    def _build_url(self, package_name: str) -> str:
 83        '''
 84        Build the URL to scrape a package
 85        Implements the abstract method of Scraper class
 86        Parameters
 87        ----------
 88        pkg_name : str
 89            Name of the package
 90
 91        Returns
 92        -------
 93        str
 94            URL to scrape
 95        '''
 96        return f'{self.PYPI_PACKAGE_DATA_URL}{package_name}/json'
 97
 98    @override
 99    def _parser(self, response: requests.Response) -> dict:
100        '''
101        Parse the JSON data of a package and return the package data as a dictionary
102        
103        Parameters
104        ----------
105        response : requests.Response
106            Response of the request to the package data URL
107        
108        Returns
109        -------
110        dict
111            dictionary with the package data in the following format:
112            {
113                'name': name: str,
114                'version': version: str,
115                'url': url: str,
116                'dependencies': dependencies: List[str]
117            }
118        '''
119        # Parse the JSON
120        data = response.json()
121
122        # Get the dependencies if they exist, build the list of dependencies as Package objects
123
124        dependencies = []
125        if data['info']['requires_dist'] is not None:
126            dependencies_raw = data['info']['requires_dist']
127            
128            # Build a dictionary with the dependencies to avoid duplicates with different versions
129            dependencies_dict = {}
130            
131            for dependency in dependencies_raw:
132                
133                # # Split the dependency in name and version
134                # dependency_data = dependency.split(' ')
135
136                # # check if the dependency has a version
137                # if len(dependency_data) == 1:
138                #     dependencies_Dict[dependency_data[0]] = ""
139                # else:
140                #     # Add the dependency to the dictionary
141                #     # dependency_name: dependency_version
142                #     dependencies_Dict[dependency_data[0]] = dependency_data[1]
143
144                # Get the name of the dependency
145                dependency_name = self._clean_name(dependency)
146                dependencies_dict[dependency_name] = None
147                
148            # Build the list of dependencies as dictionaries
149            dependencies = [{'name': name, 'version': version} for name, version in dependencies_dict.items()]
150            
151            
152        # Build the dictionary and return it
153        return {
154            'name': data['info']['name'],
155            'version': data['info']['version'],
156            'url': data['info']['project_url'],
157            'dependencies': dependencies,
158        }    
159
160    def _clean_name(self, name: str) -> str:
161        '''
162        Clean the package name from versions and other characters
163
164        Parameters
165        ----------
166        name : str
167            Name of the package
168        
169        Returns
170        -------
171        str
172            Cleaned name of the package
173            
174        '''
175        
176        regex = re.compile(r'[^\w\d]+')
177
178        # Reemplazar cualquier coincidencia de la expresión regular en la cadena de dependencia con un espacio en blanco
179        name = regex.sub(' ', name)
180
181        # Obtenga el primer elemento de la lista resultante después de dividir la cadena por espacios en blanco
182        name = name.split()[0]
183
184        return name

Class that scrapes the PyPI website to obtain information about Python packages. Implements the abstract Scraper class and, accordingly, the DataSource class.

Attributes
  • PYPI_PACKAGE_LIST_URL (str): URL of the PyPI website where the list of packages is located
  • PYPI_PACKAGE_DATA_URL (str): URL of the PyPI website where the data of a package is located
PypiScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
30    def __init__(self, request_handler: Optional[RequestHandler] = None):
31        '''
32        Constructor
33        '''
34
35        super().__init__(request_handler)

Constructor

@override
def obtain_package_names(self) -> List[str]:
37    @override
38    def obtain_package_names(self) -> List[str]:
39        '''
40        Obtain the list of packages names from the PyPI website
41        Implements the abstract method of DataSource class
42
43        Returns
44        -------
45        List[str]
46            List of packages names
47            
48        Handles
49        -------
50        Exception
51            If there is an error obtaining the list of packages, it returns an empty list
52            
53        Example
54        -------
55        >>> pypi_scraper = PypiScraper()
56        >>> pypi_scraper.obtain_package_names()
57        ['package1', 'package2', ...]
58        '''
59
60        # Build the request job
61        job = RequestJob("PYPI package names", self.PYPI_PACKAGE_LIST_URL)
62
63        # Get the HTML of the page
64        job = self.request_handler.do_request(job)
65
66        if job.response is None:
67            raise OliviaFinderException(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
68        
69        soup = BeautifulSoup(job.response.text, 'html.parser')
70        
71        pakage_list = []
72        try:
73            # Get the list of packages
74            pakage_list = [a.text for a in soup.find_all('a')]
75        except Exception as e:
76            self.logger.error(f'Error obtaining the list of packages from {self.PYPI_PACKAGE_LIST_URL}')
77
78        self.logger.info(f'Obtained {len(pakage_list)} packages from {self.PYPI_PACKAGE_LIST_URL}')
79        return pakage_list

Obtain the list of packages names from the PyPI website Implements the abstract method of DataSource class

Returns
  • List[str]: List of packages names
Handles

Exception — if parsing the package list HTML fails, the error is logged and an empty list is returned; note that a request that obtains no response instead raises OliviaFinderException.

Example
>>> pypi_scraper = PypiScraper()
>>> pypi_scraper.obtain_package_names()
['package1', 'package2', ...]