olivia_finder.data_source.scraper_ds

  1from __future__ import annotations
  2from abc import ABC, abstractmethod
  3from typing import Dict, List, Optional, Tuple, Union
  4import requests
  5import tqdm
  6from .data_source import DataSource
  7from ..myrequests.request_handler import RequestHandler
  8from ..myrequests.job import RequestJob
  9from ..utilities.exception import OliviaFinderException
 10import gc
 11
 12class ScraperDataSource(DataSource, ABC):
 13
 14    """
 15    Base class for custom scraping implementations of software repositories as CRAN, PyPI, etc.
 16    This class is an abstract class, so it cannot be instantiated.
 17    The subclasses must implement the abstract methods.
 18    This class is a subclass of the DataSource class.
 19    This class implements the methods _build_urls, obtain_package_data and obtain_packages_data
 20    
 21    """
 22    
 23    def __init__(
 24        self, 
 25        request_handler: Optional[RequestHandler] = None,
 26    ):
 27        """
 28        Constructor of the class
 29
 30        Parameters
 31        ----------
 32        request_handler : RequestHandler, optional
 33            Request handler for making the requests, by default None
 34        """
 35
 36        # Initialize the request handler, use the default one if None is passed
 37        self.request_handler = request_handler if request_handler is not None else RequestHandler()
 38
 39        # Initialize the not_found list for storing the packages that are not found
 40        self.not_found = []
 41
 42        # Initialize the logger
 43        super().__init__()
 44
 45
 46
 47    def _build_jobs(self, package_names:List[str]) -> List[RequestJob]:
 48        '''
 49        Build the jobs for scraping the packages of the package_names list
 50
 51        Parameters
 52        ----------
 53        package_names : List[str]
 54            List of package names to scrape
 55
 56        Returns
 57        -------
 58        List[RequestJob]
 59            List of jobs to scrape
 60        '''
 61
 62        return [RequestJob(package_name, self._build_url(package_name)) for package_name in package_names]
 63
 64    def obtain_packages_data(
 65        self, 
 66        package_names: Optional[List[str]] = None,
 67        progress_bar: Optional[tqdm.tqdm] = None,
 68    ) -> Tuple[List[Dict], List[str]]:
 69        '''
 70        Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list
 71        Overrides the method of the DataSource class
 72
 73        Parameters
 74        ----------
 75        package_names : Optional[List[str]], optional
 76            List of package names to scrape, if None, the package names are obtained from the data source, by default None
 77        progress_bar : Optional[tqdm.tqdm], optional
 78            Progress bar, by default None
 79
 80        Raises
 81        ------
 82        ScraperError
 83            If the list of package names is None or empty and full_scrape is disabled
 84    
 85        Returns
 86        -------
 87        Tuple[List[dict], List[str]]
 88            Tuple with the list of packages data and the list of packages not found
 89            
 90        Examples
 91        --------
 92        >>> scraper = Scraper()
 93        >>> scraper.obtain_packages_data(['numpy', 'pandas'])
 94        '''
 95
 96        # If package_names is None, obtain the package names from the data source
 97        if package_names is None or not package_names:
 98            self.logger.debug('Package names list is None or empty')
 99            self.logger.debug('Obtaining package names from data source')
100            package_names = self.obtain_package_names()
101
102        else:
103            self.logger.debug('Using package names from param list')
104
105        self.logger.debug(f'Total packages to scrape: {len(package_names)}')
106        self.logger.debug('Building jobs')
107        jobs = self._build_jobs(package_names)
108
109        # Do the requests with the RequestHandler whitout parallelization
110        self.logger.debug('Making requests')
111        finnalized_jobs = self.request_handler.do_requests(
112            request_jobs=jobs,
113            progress_bar=progress_bar
114        )
115
116        # Initialize the list of packages
117        packages = []
118        packages_keys = {}
119
120        # Initialize the list of packages not found
121        not_found = []
122
123        for finnalized_job in finnalized_jobs:
124
125            # If the response is None, the package is not found
126            if finnalized_job.response is None:
127                not_found.append(finnalized_job.key)
128                continue
129
130            # Parse the source data and add it to the list
131            if finnalized_job.key not in packages_keys:
132                packages_keys[finnalized_job.key] = True
133                package = self._parser(finnalized_job.response)
134                if package is not None and package != {}:
135                    packages.append(package)
136
137        # Clear the variables to save memory
138        del jobs
139        del finnalized_jobs
140        del packages_keys
141        del package_names
142        gc.collect()
143
144        # Return the packages and the packages not found
145        return packages, not_found
146    
147    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
148        """
149        Obtain the data of a package from the web page of the package manager
150
151        Parameters
152        ----------
153        package_name : str
154            Name of the package to be scraped
155
156        Returns
157        -------
158        dict
159            dictionary with the data of the package
160        None
161            If the package is not found
162        """
163
164        # Make the request, the response is the second element of the tuple returned by do_request
165        self.logger.debug(f'Scraping package {package_name}')
166        request_job = self.request_handler.do_request(
167            RequestJob(package_name, self._build_url(package_name))
168        )
169
170        # Parse the response
171        if request_job.response is None:
172            
173            self.logger.debug(
174                f'Package {package_name} not found\n' +
175                f'Adding {package_name} to the not found list'
176            )
177            self.not_found.append(package_name)
178            return None
179        else:
180            package_data = self._parser(request_job.response)
181
182        # If the package is found, log it and return the package data
183        self.logger.debug(f'Package {package_name} scraped successfully')
184        return package_data 
185
186    # Abstract methods
187    # ----------------
188    # This methods should be implemented in the child class
189
190    @abstractmethod
191    def obtain_package_names(self) -> List[str]:
192        """
193        Obtain the package names from the web page of the package manager
194        it must handle exceptions and return an empty list if the package names cannot be obtained
195        To be implemented by the child class
196        
197        Raises
198        ------
199        NotImplementedError
200            Bcause the method is not implemented in the base class
201        """
202        raise NotImplementedError
203
204    @abstractmethod
205    def _build_url(self, package_name: str) -> str:
206        '''
207        Build the url for scraping a package
208        This method must be implemented by the child class
209        
210        Parameters
211        ----------
212        package_name : str
213            Name of the package to scrape
214
215        Returns
216        -------
217        str
218            Url to request the package data
219        '''
220        
221        raise NotImplementedError
222
223    @abstractmethod
224    def _parser(self, response: requests.Response) -> dict:
225        '''
226        Parse the response of the package page
227        This method must be implemented by the child class
228        
229        Parameters
230        ----------
231        response : requests.Response
232            Response of the package page
233
234        Returns
235        -------
236        dict
237            Package data as a dictionary
238        '''
239        raise NotImplementedError
240    
class ScraperError(OliviaFinderException):
    '''
    Exception raised when an error occurs while scraping.

    Note: the explicit BaseException base was removed — OliviaFinderException
    already derives from the built-in exception hierarchy, so `except` and
    `isinstance` behavior is unchanged, and mixing BaseException in directly
    is discouraged (it is reserved for exit-style exceptions).
    '''
246
247
248
249    
class ScraperDataSource(olivia_finder.data_source.data_source.DataSource, abc.ABC):
 14class ScraperDataSource(DataSource, ABC):
 15
 16    """
 17    Base class for custom scraping implementations of software repositories as CRAN, PyPI, etc.
 18    This class is an abstract class, so it cannot be instantiated.
 19    The subclasses must implement the abstract methods.
 20    This class is a subclass of the DataSource class.
 21    This class implements the methods _build_urls, obtain_package_data and obtain_packages_data
 22    
 23    """
 24    
 25    def __init__(
 26        self, 
 27        request_handler: Optional[RequestHandler] = None,
 28    ):
 29        """
 30        Constructor of the class
 31
 32        Parameters
 33        ----------
 34        request_handler : RequestHandler, optional
 35            Request handler for making the requests, by default None
 36        """
 37
 38        # Initialize the request handler, use the default one if None is passed
 39        self.request_handler = request_handler if request_handler is not None else RequestHandler()
 40
 41        # Initialize the not_found list for storing the packages that are not found
 42        self.not_found = []
 43
 44        # Initialize the logger
 45        super().__init__()
 46
 47
 48
 49    def _build_jobs(self, package_names:List[str]) -> List[RequestJob]:
 50        '''
 51        Build the jobs for scraping the packages of the package_names list
 52
 53        Parameters
 54        ----------
 55        package_names : List[str]
 56            List of package names to scrape
 57
 58        Returns
 59        -------
 60        List[RequestJob]
 61            List of jobs to scrape
 62        '''
 63
 64        return [RequestJob(package_name, self._build_url(package_name)) for package_name in package_names]
 65
 66    def obtain_packages_data(
 67        self, 
 68        package_names: Optional[List[str]] = None,
 69        progress_bar: Optional[tqdm.tqdm] = None,
 70    ) -> Tuple[List[Dict], List[str]]:
 71        '''
 72        Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list
 73        Overrides the method of the DataSource class
 74
 75        Parameters
 76        ----------
 77        package_names : Optional[List[str]], optional
 78            List of package names to scrape, if None, the package names are obtained from the data source, by default None
 79        progress_bar : Optional[tqdm.tqdm], optional
 80            Progress bar, by default None
 81
 82        Raises
 83        ------
 84        ScraperError
 85            If the list of package names is None or empty and full_scrape is disabled
 86    
 87        Returns
 88        -------
 89        Tuple[List[dict], List[str]]
 90            Tuple with the list of packages data and the list of packages not found
 91            
 92        Examples
 93        --------
 94        >>> scraper = Scraper()
 95        >>> scraper.obtain_packages_data(['numpy', 'pandas'])
 96        '''
 97
 98        # If package_names is None, obtain the package names from the data source
 99        if package_names is None or not package_names:
100            self.logger.debug('Package names list is None or empty')
101            self.logger.debug('Obtaining package names from data source')
102            package_names = self.obtain_package_names()
103
104        else:
105            self.logger.debug('Using package names from param list')
106
107        self.logger.debug(f'Total packages to scrape: {len(package_names)}')
108        self.logger.debug('Building jobs')
109        jobs = self._build_jobs(package_names)
110
111        # Do the requests with the RequestHandler without parallelization
112        self.logger.debug('Making requests')
113        finnalized_jobs = self.request_handler.do_requests(
114            request_jobs=jobs,
115            progress_bar=progress_bar
116        )
117
118        # Initialize the list of packages
119        packages = []
120        packages_keys = {}
121
122        # Initialize the list of packages not found
123        not_found = []
124
125        for finnalized_job in finnalized_jobs:
126
127            # If the response is None, the package is not found
128            if finnalized_job.response is None:
129                not_found.append(finnalized_job.key)
130                continue
131
132            # Parse the source data and add it to the list
133            if finnalized_job.key not in packages_keys:
134                packages_keys[finnalized_job.key] = True
135                package = self._parser(finnalized_job.response)
136                if package is not None and package != {}:
137                    packages.append(package)
138
139        # Clear the variables to save memory
140        del jobs
141        del finnalized_jobs
142        del packages_keys
143        del package_names
144        gc.collect()
145
146        # Return the packages and the packages not found
147        return packages, not_found
148    
149    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
150        """
151        Obtain the data of a package from the web page of the package manager
152
153        Parameters
154        ----------
155        package_name : str
156            Name of the package to be scraped
157
158        Returns
159        -------
160        dict
161            dictionary with the data of the package
162        None
163            If the package is not found
164        """
165
166        # Make the request, the response is the second element of the tuple returned by do_request
167        self.logger.debug(f'Scraping package {package_name}')
168        request_job = self.request_handler.do_request(
169            RequestJob(package_name, self._build_url(package_name))
170        )
171
172        # Parse the response
173        if request_job.response is None:
174            
175            self.logger.debug(
176                f'Package {package_name} not found\n' +
177                f'Adding {package_name} to the not found list'
178            )
179            self.not_found.append(package_name)
180            return None
181        else:
182            package_data = self._parser(request_job.response)
183
184        # If the package is found, log it and return the package data
185        self.logger.debug(f'Package {package_name} scraped successfully')
186        return package_data 
187
188    # Abstract methods
189    # ----------------
190    # This methods should be implemented in the child class
191
192    @abstractmethod
193    def obtain_package_names(self) -> List[str]:
194        """
195        Obtain the package names from the web page of the package manager
196        it must handle exceptions and return an empty list if the package names cannot be obtained
197        To be implemented by the child class
198        
199        Raises
200        ------
201        NotImplementedError
202            Because the method is not implemented in the base class
203        """
204        raise NotImplementedError
205
206    @abstractmethod
207    def _build_url(self, package_name: str) -> str:
208        '''
209        Build the url for scraping a package
210        This method must be implemented by the child class
211        
212        Parameters
213        ----------
214        package_name : str
215            Name of the package to scrape
216
217        Returns
218        -------
219        str
220            Url to request the package data
221        '''
222        
223        raise NotImplementedError
224
225    @abstractmethod
226    def _parser(self, response: requests.Response) -> dict:
227        '''
228        Parse the response of the package page
229        This method must be implemented by the child class
230        
231        Parameters
232        ----------
233        response : requests.Response
234            Response of the package page
235
236        Returns
237        -------
238        dict
239            Package data as a dictionary
240        '''
241        raise NotImplementedError

Base class for custom scraping implementations of software repositories as CRAN, PyPI, etc. This class is an abstract class, so it cannot be instantiated. The subclasses must implement the abstract methods. This class is a subclass of the DataSource class. This class implements the methods _build_urls, obtain_package_data and obtain_packages_data

ScraperDataSource( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
25    def __init__(
26        self, 
27        request_handler: Optional[RequestHandler] = None,
28    ):
29        """
30        Constructor of the class
31
32        Parameters
33        ----------
34        request_handler : RequestHandler, optional
35            Request handler for making the requests, by default None
36        """
37
38        # Initialize the request handler, use the default one if None is passed
39        self.request_handler = request_handler if request_handler is not None else RequestHandler()
40
41        # Initialize the not_found list for storing the packages that are not found
42        self.not_found = []
43
44        # Initialize the logger
45        super().__init__()

Constructor of the class

Parameters
  • request_handler (RequestHandler, optional): Request handler for making the requests, by default None
def obtain_packages_data( self, package_names: Optional[List[str]] = None, progress_bar: Optional[tqdm.std.tqdm] = None) -> Tuple[List[Dict], List[str]]:
 66    def obtain_packages_data(
 67        self, 
 68        package_names: Optional[List[str]] = None,
 69        progress_bar: Optional[tqdm.tqdm] = None,
 70    ) -> Tuple[List[Dict], List[str]]:
 71        '''
 72        Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list
 73        Overrides the method of the DataSource class
 74
 75        Parameters
 76        ----------
 77        package_names : Optional[List[str]], optional
 78            List of package names to scrape, if None, the package names are obtained from the data source, by default None
 79        progress_bar : Optional[tqdm.tqdm], optional
 80            Progress bar, by default None
 81
 82        Raises
 83        ------
 84        ScraperError
 85            If the list of package names is None or empty and full_scrape is disabled
 86    
 87        Returns
 88        -------
 89        Tuple[List[dict], List[str]]
 90            Tuple with the list of packages data and the list of packages not found
 91            
 92        Examples
 93        --------
 94        >>> scraper = Scraper()
 95        >>> scraper.obtain_packages_data(['numpy', 'pandas'])
 96        '''
 97
 98        # If package_names is None, obtain the package names from the data source
 99        if package_names is None or not package_names:
100            self.logger.debug('Package names list is None or empty')
101            self.logger.debug('Obtaining package names from data source')
102            package_names = self.obtain_package_names()
103
104        else:
105            self.logger.debug('Using package names from param list')
106
107        self.logger.debug(f'Total packages to scrape: {len(package_names)}')
108        self.logger.debug('Building jobs')
109        jobs = self._build_jobs(package_names)
110
111        # Do the requests with the RequestHandler without parallelization
112        self.logger.debug('Making requests')
113        finnalized_jobs = self.request_handler.do_requests(
114            request_jobs=jobs,
115            progress_bar=progress_bar
116        )
117
118        # Initialize the list of packages
119        packages = []
120        packages_keys = {}
121
122        # Initialize the list of packages not found
123        not_found = []
124
125        for finnalized_job in finnalized_jobs:
126
127            # If the response is None, the package is not found
128            if finnalized_job.response is None:
129                not_found.append(finnalized_job.key)
130                continue
131
132            # Parse the source data and add it to the list
133            if finnalized_job.key not in packages_keys:
134                packages_keys[finnalized_job.key] = True
135                package = self._parser(finnalized_job.response)
136                if package is not None and package != {}:
137                    packages.append(package)
138
139        # Clear the variables to save memory
140        del jobs
141        del finnalized_jobs
142        del packages_keys
143        del package_names
144        gc.collect()
145
146        # Return the packages and the packages not found
147        return packages, not_found

Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list Overrides the method of the DataSource class

Parameters
  • package_names (Optional[List[str]], optional): List of package names to scrape, if None, the package names are obtained from the data source, by default None
  • progress_bar (Optional[tqdm.tqdm], optional): Progress bar, by default None
Raises
  • ScraperError: If the list of package names is None or empty and full_scrape is disabled
Returns
  • Tuple[List[dict], List[str]]: Tuple with the list of packages data and the list of packages not found
Examples
>>> scraper = Scraper()
>>> scraper.obtain_packages_data(['numpy', 'pandas'])
def obtain_package_data(self, package_name: str) -> Optional[dict]:
149    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
150        """
151        Obtain the data of a package from the web page of the package manager
152
153        Parameters
154        ----------
155        package_name : str
156            Name of the package to be scraped
157
158        Returns
159        -------
160        dict
161            dictionary with the data of the package
162        None
163            If the package is not found
164        """
165
166        # Make the request, the response is the second element of the tuple returned by do_request
167        self.logger.debug(f'Scraping package {package_name}')
168        request_job = self.request_handler.do_request(
169            RequestJob(package_name, self._build_url(package_name))
170        )
171
172        # Parse the response
173        if request_job.response is None:
174            
175            self.logger.debug(
176                f'Package {package_name} not found\n' +
177                f'Adding {package_name} to the not found list'
178            )
179            self.not_found.append(package_name)
180            return None
181        else:
182            package_data = self._parser(request_job.response)
183
184        # If the package is found, log it and return the package data
185        self.logger.debug(f'Package {package_name} scraped successfully')
186        return package_data 

Obtain the data of a package from the web page of the package manager

Parameters
  • package_name (str): Name of the package to be scraped
Returns
  • dict: dictionary with the data of the package
  • None: If the package is not found
@abstractmethod
def obtain_package_names(self) -> List[str]:
192    @abstractmethod
193    def obtain_package_names(self) -> List[str]:
194        """
195        Obtain the package names from the web page of the package manager
196        it must handle exceptions and return an empty list if the package names cannot be obtained
197        To be implemented by the child class
198        
199        Raises
200        ------
201        NotImplementedError
202            Because the method is not implemented in the base class
203        """
204        raise NotImplementedError

Obtain the package names from the web page of the package manager it must handle exceptions and return an empty list if the package names cannot be obtained To be implemented by the child class

Raises
  • NotImplementedError: Because the method is not implemented in the base class
class ScraperError(olivia_finder.utilities.exception.OliviaFinderException, builtins.BaseException):
243class ScraperError(OliviaFinderException, BaseException):
244    '''
245    Exception raised when an error occurs while scraping
246    '''

Exception raised when an error occurs while scraping

Inherited Members
olivia_finder.utilities.exception.OliviaFinderException
OliviaFinderException
builtins.BaseException
with_traceback