olivia_finder.data_source.scraper_ds
from __future__ import annotations

import gc
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple, Union

import requests
import tqdm

from .data_source import DataSource
from ..myrequests.request_handler import RequestHandler
from ..myrequests.job import RequestJob
from ..utilities.exception import OliviaFinderException


class ScraperDataSource(DataSource, ABC):

    """
    Base class for custom scraping implementations of software repositories
    such as CRAN, PyPI, etc.

    This class is abstract, so it cannot be instantiated.
    Subclasses must implement the abstract methods ``obtain_package_names``,
    ``_build_url`` and ``_parser``; on top of those this class implements
    ``_build_jobs``, ``obtain_package_data`` and ``obtain_packages_data``.
    """

    def __init__(
        self,
        request_handler: Optional[RequestHandler] = None,
    ):
        """
        Constructor of the class

        Parameters
        ----------
        request_handler : RequestHandler, optional
            Request handler for making the requests, by default None
            (a default ``RequestHandler`` is created in that case)
        """

        # Initialize the request handler, use the default one if None is passed
        self.request_handler = request_handler if request_handler is not None else RequestHandler()

        # Packages that could not be found are accumulated here
        self.not_found: List[str] = []

        # Initialize the logger (done by the DataSource base class)
        super().__init__()

    def _build_jobs(self, package_names: List[str]) -> List[RequestJob]:
        '''
        Build the jobs for scraping the packages of the package_names list

        Parameters
        ----------
        package_names : List[str]
            List of package names to scrape

        Returns
        -------
        List[RequestJob]
            List of jobs to scrape
        '''

        return [RequestJob(name, self._build_url(name)) for name in package_names]

    def obtain_packages_data(
        self,
        package_names: Optional[List[str]] = None,
        progress_bar: Optional[tqdm.tqdm] = None,
    ) -> Tuple[List[Dict], List[str]]:
        '''
        Scrape a list of packages from a package manager; if a package is not
        found, it is added to the returned not-found list.
        Overrides the method of the DataSource class.

        Parameters
        ----------
        package_names : Optional[List[str]], optional
            List of package names to scrape, if None or empty, the package
            names are obtained from the data source, by default None
        progress_bar : Optional[tqdm.tqdm], optional
            Progress bar, by default None

        Returns
        -------
        Tuple[List[dict], List[str]]
            Tuple with the list of packages data and the list of packages not found

        Examples
        --------
        >>> scraper = Scraper()
        >>> scraper.obtain_packages_data(['numpy', 'pandas'])
        '''

        # If no package names were supplied, obtain them from the data source
        if not package_names:
            self.logger.debug('Package names list is None or empty')
            self.logger.debug('Obtaining package names from data source')
            package_names = self.obtain_package_names()
        else:
            self.logger.debug('Using package names from param list')

        self.logger.debug(f'Total packages to scrape: {len(package_names)}')
        self.logger.debug('Building jobs')
        jobs = self._build_jobs(package_names)

        # Do the requests with the RequestHandler without parallelization
        self.logger.debug('Making requests')
        finalized_jobs = self.request_handler.do_requests(
            request_jobs=jobs,
            progress_bar=progress_bar
        )

        packages: List[Dict] = []        # parsed package data
        seen_keys: set = set()           # keys already parsed (deduplication)
        not_found: List[str] = []        # packages without a response

        for job in finalized_jobs:

            # A missing response means the package was not found
            if job.response is None:
                not_found.append(job.key)
                continue

            # Parse the source data once per key and keep non-empty results
            if job.key not in seen_keys:
                seen_keys.add(job.key)
                package = self._parser(job.response)
                if package:
                    packages.append(package)

        # Release the (potentially large) intermediate structures eagerly
        del jobs
        del finalized_jobs
        gc.collect()

        # Return the packages and the packages not found
        return packages, not_found

    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
        """
        Obtain the data of a package from the web page of the package manager

        Parameters
        ----------
        package_name : str
            Name of the package to be scraped

        Returns
        -------
        dict
            dictionary with the data of the package
        None
            If the package is not found (the name is also appended to
            ``self.not_found``)
        """

        # Make the request for this single package
        self.logger.debug(f'Scraping package {package_name}')
        request_job = self.request_handler.do_request(
            RequestJob(package_name, self._build_url(package_name))
        )

        # No response: record the package as not found
        if request_job.response is None:
            self.logger.debug(
                f'Package {package_name} not found\n' +
                f'Adding {package_name} to the not found list'
            )
            self.not_found.append(package_name)
            return None

        # Parse the response and return the package data
        package_data = self._parser(request_job.response)
        self.logger.debug(f'Package {package_name} scraped successfully')
        return package_data

    # Abstract methods
    # ----------------
    # These methods must be implemented in the child class

    @abstractmethod
    def obtain_package_names(self) -> List[str]:
        """
        Obtain the package names from the web page of the package manager.
        It must handle exceptions and return an empty list if the package
        names cannot be obtained.
        To be implemented by the child class.

        Raises
        ------
        NotImplementedError
            Because the method is not implemented in the base class
        """
        raise NotImplementedError

    @abstractmethod
    def _build_url(self, package_name: str) -> str:
        '''
        Build the url for scraping a package
        This method must be implemented by the child class

        Parameters
        ----------
        package_name : str
            Name of the package to scrape

        Returns
        -------
        str
            Url to request the package data
        '''

        raise NotImplementedError

    @abstractmethod
    def _parser(self, response: requests.Response) -> dict:
        '''
        Parse the response of the package page
        This method must be implemented by the child class

        Parameters
        ----------
        response : requests.Response
            Response of the package page

        Returns
        -------
        dict
            Package data as a dictionary
        '''
        raise NotImplementedError


# NOTE: OliviaFinderException is expected to derive from Exception, which
# already sits under BaseException; the original extra BaseException base
# was redundant and has been dropped so the exception stays catchable via
# a plain ``except Exception``.
class ScraperError(OliviaFinderException):
    '''
    Exception raised when an error occurs while scraping
    '''
14class ScraperDataSource(DataSource, ABC): 15 16 """ 17 Base class for custom scraping implementations of software repositories as CRAN, PyPI, etc. 18 This class is an abstract class, so it cannot be instantiated. 19 The subclasses must implement the abstract methods. 20 This class is a subclass of the DataSource class. 21 This class implements the methods _build_urls, obtain_package_data and obtain_packages_data 22 23 """ 24 25 def __init__( 26 self, 27 request_handler: Optional[RequestHandler] = None, 28 ): 29 """ 30 Constructor of the class 31 32 Parameters 33 ---------- 34 request_handler : RequestHandler, optional 35 Request handler for making the requests, by default None 36 """ 37 38 # Initialize the request handler, use the default one if None is passed 39 self.request_handler = request_handler if request_handler is not None else RequestHandler() 40 41 # Initialize the not_found list for storing the packages that are not found 42 self.not_found = [] 43 44 # Initialize the logger 45 super().__init__() 46 47 48 49 def _build_jobs(self, package_names:List[str]) -> List[RequestJob]: 50 ''' 51 Build the jobs for scraping the packages of the package_names list 52 53 Parameters 54 ---------- 55 package_names : List[str] 56 List of package names to scrape 57 58 Returns 59 ------- 60 List[RequestJob] 61 List of jobs to scrape 62 ''' 63 64 return [RequestJob(package_name, self._build_url(package_name)) for package_name in package_names] 65 66 def obtain_packages_data( 67 self, 68 package_names: Optional[List[str]] = None, 69 progress_bar: Optional[tqdm.tqdm] = None, 70 ) -> Tuple[List[Dict], List[str]]: 71 ''' 72 Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list 73 Overrides the method of the DataSource class 74 75 Parameters 76 ---------- 77 package_names : Optional[List[str]], optional 78 List of package names to scrape, if None, the package names are obtained from the data source, by default None 79 
progress_bar : Optional[tqdm.tqdm], optional 80 Progress bar, by default None 81 82 Raises 83 ------ 84 ScraperError 85 If the list of package names is None or empty and full_scrape is disabled 86 87 Returns 88 ------- 89 Tuple[List[dict], List[str]] 90 Tuple with the list of packages data and the list of packages not found 91 92 Examples 93 -------- 94 >>> scraper = Scraper() 95 >>> scraper.obtain_packages_data(['numpy', 'pandas']) 96 ''' 97 98 # If package_names is None, obtain the package names from the data source 99 if package_names is None or not package_names: 100 self.logger.debug('Package names list is None or empty') 101 self.logger.debug('Obtaining package names from data source') 102 package_names = self.obtain_package_names() 103 104 else: 105 self.logger.debug('Using package names from param list') 106 107 self.logger.debug(f'Total packages to scrape: {len(package_names)}') 108 self.logger.debug('Building jobs') 109 jobs = self._build_jobs(package_names) 110 111 # Do the requests with the RequestHandler whitout parallelization 112 self.logger.debug('Making requests') 113 finnalized_jobs = self.request_handler.do_requests( 114 request_jobs=jobs, 115 progress_bar=progress_bar 116 ) 117 118 # Initialize the list of packages 119 packages = [] 120 packages_keys = {} 121 122 # Initialize the list of packages not found 123 not_found = [] 124 125 for finnalized_job in finnalized_jobs: 126 127 # If the response is None, the package is not found 128 if finnalized_job.response is None: 129 not_found.append(finnalized_job.key) 130 continue 131 132 # Parse the source data and add it to the list 133 if finnalized_job.key not in packages_keys: 134 packages_keys[finnalized_job.key] = True 135 package = self._parser(finnalized_job.response) 136 if package is not None and package != {}: 137 packages.append(package) 138 139 # Clear the variables to save memory 140 del jobs 141 del finnalized_jobs 142 del packages_keys 143 del package_names 144 gc.collect() 145 146 # 
Return the packages and the packages not found 147 return packages, not_found 148 149 def obtain_package_data(self, package_name: str) -> Union[dict, None]: 150 """ 151 Obtain the data of a package from the web page of the package manager 152 153 Parameters 154 ---------- 155 package_name : str 156 Name of the package to be scraped 157 158 Returns 159 ------- 160 dict 161 dictionary with the data of the package 162 None 163 If the package is not found 164 """ 165 166 # Make the request, the response is the second element of the tuple returned by do_request 167 self.logger.debug(f'Scraping package {package_name}') 168 request_job = self.request_handler.do_request( 169 RequestJob(package_name, self._build_url(package_name)) 170 ) 171 172 # Parse the response 173 if request_job.response is None: 174 175 self.logger.debug( 176 f'Package {package_name} not found\n' + 177 f'Adding {package_name} to the not found list' 178 ) 179 self.not_found.append(package_name) 180 return None 181 else: 182 package_data = self._parser(request_job.response) 183 184 # If the package is found, log it and return the package data 185 self.logger.debug(f'Package {package_name} scraped successfully') 186 return package_data 187 188 # Abstract methods 189 # ---------------- 190 # This methods should be implemented in the child class 191 192 @abstractmethod 193 def obtain_package_names(self) -> List[str]: 194 """ 195 Obtain the package names from the web page of the package manager 196 it must handle exceptions and return an empty list if the package names cannot be obtained 197 To be implemented by the child class 198 199 Raises 200 ------ 201 NotImplementedError 202 Bcause the method is not implemented in the base class 203 """ 204 raise NotImplementedError 205 206 @abstractmethod 207 def _build_url(self, package_name: str) -> str: 208 ''' 209 Build the url for scraping a package 210 This method must be implemented by the child class 211 212 Parameters 213 ---------- 214 package_name : str 
215 Name of the package to scrape 216 217 Returns 218 ------- 219 str 220 Url to request the package data 221 ''' 222 223 raise NotImplementedError 224 225 @abstractmethod 226 def _parser(self, response: requests.Response) -> dict: 227 ''' 228 Parse the response of the package page 229 This method must be implemented by the child class 230 231 Parameters 232 ---------- 233 response : requests.Response 234 Response of the package page 235 236 Returns 237 ------- 238 dict 239 Package data as a dictionary 240 ''' 241 raise NotImplementedError
Base class for custom scraping implementations of software repositories such as CRAN, PyPI, etc. This class is abstract, so it cannot be instantiated; the subclasses must implement the abstract methods. This class is a subclass of the DataSource class and implements the methods _build_urls, obtain_package_data and obtain_packages_data.
25 def __init__( 26 self, 27 request_handler: Optional[RequestHandler] = None, 28 ): 29 """ 30 Constructor of the class 31 32 Parameters 33 ---------- 34 request_handler : RequestHandler, optional 35 Request handler for making the requests, by default None 36 """ 37 38 # Initialize the request handler, use the default one if None is passed 39 self.request_handler = request_handler if request_handler is not None else RequestHandler() 40 41 # Initialize the not_found list for storing the packages that are not found 42 self.not_found = [] 43 44 # Initialize the logger 45 super().__init__()
Constructor of the class
Parameters
- request_handler (RequestHandler, optional): Request handler for making the requests, by default None
66 def obtain_packages_data( 67 self, 68 package_names: Optional[List[str]] = None, 69 progress_bar: Optional[tqdm.tqdm] = None, 70 ) -> Tuple[List[Dict], List[str]]: 71 ''' 72 Scrape a list of packages from a package manager, if the package is not found, it is added to the not_found list 73 Overrides the method of the DataSource class 74 75 Parameters 76 ---------- 77 package_names : Optional[List[str]], optional 78 List of package names to scrape, if None, the package names are obtained from the data source, by default None 79 progress_bar : Optional[tqdm.tqdm], optional 80 Progress bar, by default None 81 82 Raises 83 ------ 84 ScraperError 85 If the list of package names is None or empty and full_scrape is disabled 86 87 Returns 88 ------- 89 Tuple[List[dict], List[str]] 90 Tuple with the list of packages data and the list of packages not found 91 92 Examples 93 -------- 94 >>> scraper = Scraper() 95 >>> scraper.obtain_packages_data(['numpy', 'pandas']) 96 ''' 97 98 # If package_names is None, obtain the package names from the data source 99 if package_names is None or not package_names: 100 self.logger.debug('Package names list is None or empty') 101 self.logger.debug('Obtaining package names from data source') 102 package_names = self.obtain_package_names() 103 104 else: 105 self.logger.debug('Using package names from param list') 106 107 self.logger.debug(f'Total packages to scrape: {len(package_names)}') 108 self.logger.debug('Building jobs') 109 jobs = self._build_jobs(package_names) 110 111 # Do the requests with the RequestHandler whitout parallelization 112 self.logger.debug('Making requests') 113 finnalized_jobs = self.request_handler.do_requests( 114 request_jobs=jobs, 115 progress_bar=progress_bar 116 ) 117 118 # Initialize the list of packages 119 packages = [] 120 packages_keys = {} 121 122 # Initialize the list of packages not found 123 not_found = [] 124 125 for finnalized_job in finnalized_jobs: 126 127 # If the response is None, the package is 
not found 128 if finnalized_job.response is None: 129 not_found.append(finnalized_job.key) 130 continue 131 132 # Parse the source data and add it to the list 133 if finnalized_job.key not in packages_keys: 134 packages_keys[finnalized_job.key] = True 135 package = self._parser(finnalized_job.response) 136 if package is not None and package != {}: 137 packages.append(package) 138 139 # Clear the variables to save memory 140 del jobs 141 del finnalized_jobs 142 del packages_keys 143 del package_names 144 gc.collect() 145 146 # Return the packages and the packages not found 147 return packages, not_found
Scrape a list of packages from a package manager; if a package is not found, it is added to the not_found list. Overrides the method of the DataSource class.
Parameters
- package_names (Optional[List[str]], optional): List of package names to scrape, if None, the package names are obtained from the data source, by default None
- progress_bar (Optional[tqdm.tqdm], optional): Progress bar, by default None
Raises
- ScraperError: If the list of package names is None or empty and full_scrape is disabled
Returns
- Tuple[List[dict], List[str]]: Tuple with the list of packages data and the list of packages not found
Examples
>>> scraper = Scraper()
>>> scraper.obtain_packages_data(['numpy', 'pandas'])
149 def obtain_package_data(self, package_name: str) -> Union[dict, None]: 150 """ 151 Obtain the data of a package from the web page of the package manager 152 153 Parameters 154 ---------- 155 package_name : str 156 Name of the package to be scraped 157 158 Returns 159 ------- 160 dict 161 dictionary with the data of the package 162 None 163 If the package is not found 164 """ 165 166 # Make the request, the response is the second element of the tuple returned by do_request 167 self.logger.debug(f'Scraping package {package_name}') 168 request_job = self.request_handler.do_request( 169 RequestJob(package_name, self._build_url(package_name)) 170 ) 171 172 # Parse the response 173 if request_job.response is None: 174 175 self.logger.debug( 176 f'Package {package_name} not found\n' + 177 f'Adding {package_name} to the not found list' 178 ) 179 self.not_found.append(package_name) 180 return None 181 else: 182 package_data = self._parser(request_job.response) 183 184 # If the package is found, log it and return the package data 185 self.logger.debug(f'Package {package_name} scraped successfully') 186 return package_data
Obtain the data of a package from the web page of the package manager
Parameters
- package_name (str): Name of the package to be scraped
Returns
- dict: dictionary with the data of the package
- None: If the package is not found
192 @abstractmethod 193 def obtain_package_names(self) -> List[str]: 194 """ 195 Obtain the package names from the web page of the package manager 196 it must handle exceptions and return an empty list if the package names cannot be obtained 197 To be implemented by the child class 198 199 Raises 200 ------ 201 NotImplementedError 202 Bcause the method is not implemented in the base class 203 """ 204 raise NotImplementedError
Obtain the package names from the web page of the package manager. It must handle exceptions and return an empty list if the package names cannot be obtained. To be implemented by the child class.
Raises
- NotImplementedError: Because the method is not implemented in the base class
243class ScraperError(OliviaFinderException, BaseException): 244 ''' 245 Exception raised when an error occurs while scraping 246 '''
Exception raised when an error occurs while scraping
Inherited Members
- builtins.BaseException
- with_traceback