olivia_finder.data_source.repository_scrapers.npm
import json
import os
from typing import List, Optional

import requests
from typing_extensions import override
from tqdm import tqdm

from ..scraper_ds import ScraperDataSource
from ...utilities.config import Configuration
from ...myrequests.request_handler import RequestHandler
from ...myrequests.job import RequestJob


class NpmScraper(ScraperDataSource):
    '''
    Class that scrapes the NPM registry to obtain information about JavaScript packages

    Attributes
    ----------
    NPM_PACKAGE_REGISTRY_URL : str
        Base URL of the NPM registry; serves per-package data and the
        registry's total document count
    NPM_PACKAGE_LIST_URL : str
        URL of the CouchDB `_all_docs` endpoint that lists every package document
    NPM_REPO_URL : str
        Base URL of a package's page on the npmjs.com website

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
    '''

    # Constants
    NPM_PACKAGE_REGISTRY_URL: str = 'https://registry.npmjs.org/'
    NPM_PACKAGE_LIST_URL: str = 'https://skimdb.npmjs.com/registry/_all_docs'
    NPM_REPO_URL: str = 'https://www.npmjs.com/package'

    def __init__(
        self,
        request_handler: Optional[RequestHandler] = None,
    ):
        '''
        Constructor of the class
        '''
        # Folder where the chunks will be saved; the NPM registry is too big
        # to be downloaded in one go. Set by _init_chunks_folder() when chunk
        # saving is requested.
        self.chunks_folder: Optional[str] = None

        super().__init__(request_handler)

    @override
    def obtain_package_names(
        self,
        page_size: int = 100,
        show_progress_bar: bool = True,
        save_chunks: bool = False,
    ) -> List[str]:
        '''
        Function to obtain the names of the packages in the NPM repository

        Parameters
        ----------
        page_size : int = 100
            Number of packages to be requested in each page
        show_progress_bar : bool = True
            Flag to show the progress bar
        save_chunks : bool = False
            Flag to save the chunks of the registry in the chunks folder

        Returns
        -------
        List[str]
            List with the name of each package in the registry
            (FIX: the previous annotation claimed List[dict], but the method
            has always returned the `id` strings)

        Examples
        --------
        >>> scraper = NpmScraper()
        >>> scraper.obtain_package_names()
        '''
        # Get the total number of packages to size the crawl
        response = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30)
        total_packages = response.json()['doc_count']

        # Calculate the number of pages (chunks)
        num_pages = (total_packages // page_size) + 1

        self.logger.debug(f'Total number of packages: {total_packages}')
        self.logger.debug(f'Number of pages: {num_pages}')

        # Initialize the progress bar if is set
        progress_bar = tqdm(total=num_pages) if show_progress_bar else None

        # FIX: initialize the chunks folder BEFORE logging it; the original
        # logged the still-None path
        if save_chunks:
            self._init_chunks_folder()
            self.logger.debug(f'Saving chunks at: {self.chunks_folder}')

        # Obtain the names of the packages requesting the pages
        pages: List[List[dict]] = []
        last_key = None
        for i in range(num_pages):

            # Download the page, retrying on disconnects and empty responses
            page = None
            while page is None:
                try:
                    page = self._download_page(last_key, page_size)
                except requests.exceptions.ConnectionError:
                    self.logger.debug(f'Connection error in page {i} of {num_pages}')
                    self.logger.debug(f'Last key: {last_key}')
                    self.logger.debug('Retrying...')
                    # FIX: without this `continue` the len(page) check below
                    # raised TypeError on the still-None page
                    continue

                # check if the page is empty
                if len(page) == 0:
                    self.logger.debug(f'Empty page {i} of {num_pages}')
                    self.logger.debug(f'Last key: {last_key}')
                    page = None

            pages.append(page)
            self.logger.debug(f'Downloaded page {i} of {num_pages}')

            # get the last key of the page for the next iter
            last_key = page[-1]['id']

            # Save chunk if is set
            if save_chunks:
                self.logger.debug(f'Saving chunk {i} of {num_pages}')
                with open(f'{self.chunks_folder}/chunk_{i}.json', 'w') as f:
                    # FIX: write real JSON; str(page) wrote a Python repr
                    # that JSON parsers cannot read back
                    json.dump(page, f)

            # Update progress bar if is set
            if progress_bar is not None:
                progress_bar.update(1)

        if progress_bar is not None:
            progress_bar.close()

        package_names = [row['id'] for page in pages for row in page]
        self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}')
        return package_names

    def _init_chunks_folder(self):
        '''
        Function to initialize the chunks folder, where the chunks will be saved
        Loads the path from the configuration file
        '''
        self.chunks_folder = f'{Configuration().get_key("folders", "working_dir")}/npm_package_names_chunks'
        os.makedirs(self.chunks_folder, exist_ok=True)

    def _download_page(
        self,
        start_key: Optional[str] = None,
        size: int = 1000,
        retries: int = 5
    ) -> List[dict]:
        '''
        Function to download a page of documents from the NPM repository and
        return the raw row dictionaries (each row has at least an `id` key)

        Parameters
        ----------
        start_key : str = None
            Key to start the download; None requests the first page
        size : int = 1000
            Size of the page to download
        retries : int = 5
            Number of retries left before giving up with an empty list

        Returns
        -------
        List[dict]
            Rows of the page, or an empty list on failure
        '''
        # CouchDB treats startkey as inclusive, so pages after the first
        # resume from the previous page's last key
        if start_key is None:
            params = {'limit': size}
        else:
            encode_start_key = "\"" + start_key + "\""
            params = {'startkey': encode_start_key, 'limit': size}

        # Download the page
        job = self.request_handler.do_request(
            RequestJob(
                key='npm_download_page',
                url=self.NPM_PACKAGE_LIST_URL,
                params=params,
            )
        )

        # If the response is None, return an empty list
        if job.response is None:
            self.logger.debug(f'None response at __download_page: url={self.NPM_PACKAGE_LIST_URL}')
            return []

        # If the response body is not valid JSON, retry
        try:
            data = job.response.json()
        except ValueError:
            msg = f'EXCEPTION at __download_page: url={self.NPM_PACKAGE_LIST_URL}\n'
            msg += f'Response: {job.response.text}\n'
            msg += f'Params: {params}\n'
            msg += f'Retrying, times left: {retries}\n'
            self.logger.debug(msg)

            # FIX: stop recursing when the retry budget is exhausted; the
            # original recursed unconditionally and could loop forever
            if retries <= 0:
                return []
            return self._download_page(start_key, size, retries - 1)

        if data.keys() == {'error', 'reason'}:
            if retries <= 0:
                return []
            return self._download_page(start_key, size, retries - 1)

        rows = data['rows']
        # startkey is inclusive: the first row of every page after the first
        # repeats the previous page's last entry, so drop it.
        # FIX: the original dropped row 0 unconditionally, silently losing
        # the registry's first package on the first page.
        return rows[1:] if start_key is not None else rows

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Function to build the URL of the package

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package
        '''
        # NPM_PACKAGE_REGISTRY_URL already ends with '/' (FIX: the original
        # inserted a second slash, producing https://registry.npmjs.org//name)
        return f'{self.NPM_PACKAGE_REGISTRY_URL}{package_name}'

    @override
    def _parser(self, response: requests.Response) -> dict:
        '''
        Parse the response of the request

        Parameters
        ----------
        response : requests.Response
            Response of the request

        Returns
        -------
        dict
            dictionary with the parsed data (empty dict if the package
            does not exist or has no `_id`)

        Examples
        --------
        >>> response = requests.get('https://registry.npmjs.org/express')
        >>> parser(response)
        {
            'name': 'express',
            'version': '4.17.1',
            'dependencies': [
                {'name': 'accepts', 'version': '1.3.7'},
                {'name': 'array-flatten', 'version': '1.1.1'},
                {'name': 'body-parser', 'version': '1.19.0'},
                {'name': 'content-disposition', 'version': '0.5.3'},
                {'name': 'content-type', 'version': '1.0.4'}
            ],
            'url': 'https://www.npmjs.com/package/express'
        }
        '''
        response_json = response.json()

        # The registry answers with an 'error' key for unknown packages
        if 'error' in response_json:
            return {}

        # Get the package name
        try:
            package_name = response_json['_id']
        except KeyError:
            # If the package does not have a name, return an empty dict
            return {}

        # Latest published version, if any
        try:
            package_version = response_json['dist-tags']['latest']
        except KeyError:
            package_version = None

        # Collect regular and development dependencies of the latest version
        # (deduplicated from the original's two copy-pasted blocks)
        dep_list = []
        for section in ('dependencies', 'devDependencies'):
            try:
                section_deps = response_json['versions'][package_version][section] or {}
            except KeyError:
                section_deps = {}
            dep_list += [
                {'name': name, 'version': version}
                for name, version in section_deps.items()
            ]

        return {
            'name': package_name,
            'version': package_version,
            'dependencies': dep_list,
            'url': f'{self.NPM_REPO_URL}/{package_name}'
        }
class NpmScraper(ScraperDataSource):
    '''
    Scraper for the NPM registry that collects JavaScript package data

    Attributes
    ----------
    NPM_PACKAGE_REGISTRY_URL : str
        URL of the page that contains the list of packages
    NPM_PACKAGE_LIST_URL : str
        URL of the page that contains the data of a package
    NPM_REPO_URL : str
        URL of the page that contains the data of a package

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler for the scraper; a generic RequestHandler is used when None
    '''

    # Constants
    NPM_PACKAGE_REGISTRY_URL: str = 'https://registry.npmjs.org/'
    NPM_PACKAGE_LIST_URL: str = 'https://skimdb.npmjs.com/registry/_all_docs'
    NPM_REPO_URL: str = 'https://www.npmjs.com/package'

    def __init__(self, request_handler: Optional[RequestHandler] = None):
        '''
        Constructor of the class
        '''
        # Destination folder for registry chunks (the registry is too large
        # to fetch in one request); populated by _init_chunks_folder()
        self.chunks_folder = None
        super().__init__(request_handler)

    @override
    def obtain_package_names(
        self,
        page_size: int = 100,
        show_progress_bar: bool = True,
        save_chunks: bool = False,
    ) -> List[dict]:
        '''
        Collect the names of every package published in the NPM registry

        Parameters
        ----------
        page_size : int = 100
            How many packages each page request asks for
        show_progress_bar : bool = True
            Whether a tqdm progress bar is displayed
        save_chunks : bool = False
            Whether each downloaded page is written to the chunks folder

        Returns
        -------
        List[dict]
            List of dictionaries with the name and version of the packages

        Examples
        --------
        >>> scraper = NpmScraper()
        >>> scraper.obtain_package_names()
        '''
        # Ask the registry root for its document count to size the crawl
        registry_info = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30)
        total_packages = registry_info.json()['doc_count']

        # One page per page_size documents, plus a final partial page
        num_pages = total_packages // page_size + 1

        self.logger.debug(f'Total number of packages: {total_packages}')
        self.logger.debug(f'Number of pages: {num_pages}')

        # Optional progress bar
        progress_bar = None
        if show_progress_bar:
            progress_bar = tqdm(total=num_pages)

        # Optional chunk folder
        if save_chunks:
            self.logger.debug(f'Saving chunks at: {self.chunks_folder}')
            self._init_chunks_folder()

        pages = []
        last_key = None
        for page_idx in range(num_pages):

            # Keep requesting until a non-empty page arrives, surviving drops
            current = None
            while current is None:
                try:
                    current = self._download_page(last_key, page_size)
                except requests.exceptions.ConnectionError:
                    self.logger.debug(f'Connection error in page {page_idx} of {num_pages}')
                    self.logger.debug(f'Last key: {last_key}')
                    self.logger.debug('Retrying...')

                if len(current) == 0:  # type: ignore
                    self.logger.debug(f'Empty page {page_idx} of {num_pages}')
                    self.logger.debug(f'Last key: {last_key}')
                    current = None

            pages.append(current)
            self.logger.debug(f'Downloaded page {page_idx} of {num_pages}')

            # The next request resumes from this page's final document id
            last_key = current[-1]['id']

            if save_chunks:
                self.logger.debug(f'Saving chunk {page_idx} of {num_pages}')
                with open(f'{self.chunks_folder}/chunk_{page_idx}.json', 'w') as f:
                    f.write(str(current))

            if progress_bar is not None:
                progress_bar.update(1)

        package_names = [entry['id'] for chunk in pages for entry in chunk]
        self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}')
        return package_names

    def _init_chunks_folder(self):
        '''
        Create the folder where downloaded registry chunks are stored
        The path is taken from the configuration file
        '''
        working_dir = Configuration().get_key("folders", "working_dir")
        self.chunks_folder = f'{working_dir}/npm_package_names_chunks'
        os.makedirs(self.chunks_folder, exist_ok=True)

    def _download_page(
        self,
        start_key: Optional[str] = None,
        size: int = 1000,
        retries: int = 5
    ) -> List[dict]:
        '''
        Fetch one page of package documents from the NPM registry

        Parameters
        ----------
        start_key : str = None
            Document id to resume the listing from (None for the first page)
        size : int = 1000
            Number of documents requested per page
        retries : int = 5
            Remaining retry attempts on malformed or error responses

        Returns
        -------
        List[dict]
            List of dictionaries with the name and version of the packages
        '''
        # The first page carries no resume key
        if start_key is None:
            params = {'limit': size}
        else:
            quoted_key = "\"" + start_key + "\""
            params = {'startkey': quoted_key, 'limit': size}

        job = self.request_handler.do_request(
            RequestJob(
                key='npm_download_page',
                url=self.NPM_PACKAGE_LIST_URL,
                params=params,
            )
        )

        # No response at all: give up on this page
        if job.response is None:
            self.logger.debug(f'None response at __download_page: url={self.NPM_PACKAGE_LIST_URL}')
            return []

        # Malformed body: log and retry recursively
        try:
            data = job.response.json()
        except Exception:
            self.logger.debug(
                f'EXCEPTION at __download_page: url={self.NPM_PACKAGE_LIST_URL}\n'
                f'Response: {job.response.text}\n'
                f'Params: {params}\n'
                f'Retrying, times left: {retries}\n'
            )
            return self._download_page(start_key, size, retries - 1)

        # A CouchDB error document triggers a retry as well
        if data.keys() == {'error', 'reason'}:
            return self._download_page(start_key, size, retries - 1)

        # Skip the row repeated from the previous page's last key
        return data['rows'][1:]

    @override
    def _build_url(self, package_name: str):
        '''
        Compose the registry URL for a given package

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package
        '''
        return f'{self.NPM_PACKAGE_REGISTRY_URL}/{package_name}'

    @override
    def _parser(self, response: requests.Response) -> dict:
        '''
        Turn a registry response into a dictionary of package data

        Parameters
        ----------
        response : requests.Response
            Response of the request

        Returns
        -------
        dict
            dictionary with the parsed data

        Examples
        --------
        >>> response = requests.get('https://registry.npmjs.org/express')
        >>> parser(response)
        {
            'name': 'express',
            'version': '4.17.1',
            'dependencies': [
                {'name': 'accepts', 'version': '1.3.7'},
                {'name': 'array-flatten', 'version': '1.1.1'},
                {'name': 'body-parser', 'version': '1.19.0'},
                {'name': 'content-disposition', 'version': '0.5.3'},
                {'name': 'content-type', 'version': '1.0.4'}
            ],
            'url': 'https://www.npmjs.com/package/express'
        }
        '''
        payload = response.json()

        # The registry answers with an 'error' key for unknown packages
        if 'error' in payload:
            return {}

        # A document without an _id cannot be parsed
        try:
            package_name = payload['_id']
        except KeyError:
            return {}

        # Latest published version, if any
        try:
            package_version = payload['dist-tags']['latest']
        except KeyError:
            package_version = None

        dependency_entries = []

        # Regular dependencies of the latest version
        try:
            runtime_deps = payload['versions'][package_version]['dependencies']
            if runtime_deps is None:
                runtime_deps = {}
        except KeyError:
            runtime_deps = {}
        for dep_name, dep_version in runtime_deps.items():
            dependency_entries.append({'name': dep_name, 'version': dep_version})

        # Development dependencies of the latest version
        try:
            dev_deps = payload['versions'][package_version]['devDependencies']
            if dev_deps is None:
                dev_deps = {}
        except KeyError:
            dev_deps = {}
        for dep_name, dep_version in dev_deps.items():
            dependency_entries.append({'name': dep_name, 'version': dep_version})

        return {
            'name': package_name,
            'version': package_version,
            'dependencies': dependency_entries,
            'url': f'{self.NPM_REPO_URL}/{package_name}'
        }
Class that scrapes the NPM website to obtain information about JavaScript packages
Attributes
- NPM_PACKAGE_REGISTRY_URL (str): Base URL of the NPM registry, used to fetch per-package data and the registry's total document count
- NPM_PACKAGE_LIST_URL (str): URL of the CouchDB `_all_docs` endpoint that lists every package document in the registry
- NPM_REPO_URL (str): Base URL of a package's page on the npmjs.com website
Parameters
- request_handler (RequestHandler = None): Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
NpmScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
38 def __init__( 39 self, 40 request_handler: Optional[RequestHandler] = None, 41 ): 42 ''' 43 Constructor of the class 44 ''' 45 46 self.chunks_folder = None 47 ''' 48 Folder where the chunks will be saved, 49 This is because the NPM registry is too big to be downloaded in one go 50 ''' 51 52 super().__init__(request_handler)
Constructor of the class
chunks_folder
Folder where the chunks will be saved, This is because the NPM registry is too big to be downloaded in one go
@override
def
obtain_package_names( self, page_size: int = 100, show_progress_bar: bool = True, save_chunks: bool = False) -> List[dict]:
54 @override 55 def obtain_package_names( 56 self, 57 page_size: int = 100, 58 show_progress_bar: bool = True, 59 save_chunks: bool = False, 60 ) -> List[dict]: 61 62 ''' 63 Function to obtain the names of the packages in the NPM repository 64 65 Parameters 66 ---------- 67 page_size : int = 100 68 Number of packages to be requested in each page 69 show_progress_bar : bool = True 70 Flag to show the progress bar 71 save_chunks : bool = False 72 Flag to save the chunks of the registry in the chunks folder 73 74 Returns 75 ------- 76 List[dict] 77 List of dictionaries with the name and version of the packages 78 79 Examples 80 -------- 81 >>> scraper = NpmScraper() 82 >>> scraper.obtain_package_names() 83 ''' 84 # Get the total number of packages 85 response = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30) 86 total_packages = response.json()['doc_count'] 87 88 # Calculate the number of pages (chunks) 89 num_pages = (total_packages // page_size) + 1 90 91 self.logger.debug(f'Total number of packages: {total_packages}') 92 self.logger.debug(f'Number of pages: {num_pages}') 93 94 # Initialize the progress bar if is set 95 progress_bar = tqdm(total=num_pages) if show_progress_bar else None 96 97 # Initialize the chunks folder if is set 98 if save_chunks: 99 self.logger.debug(f'Saving chunks at: {self.chunks_folder}') 100 self._init_chunks_folder() 101 102 # Obtain the names of the packages requesting the pages 103 pages = [] 104 last_key = None 105 for i in range(num_pages): 106 107 # Download the page 108 # Handle disconnects 109 page = None 110 while page is None: 111 try: 112 page = self._download_page(last_key, page_size) 113 except requests.exceptions.ConnectionError: 114 self.logger.debug(f'Connection error in page {i} of {num_pages}') 115 self.logger.debug(f'Last key: {last_key}') 116 self.logger.debug('Retrying...') 117 118 # check if the page is empty 119 if len(page) == 0: # type: ignore 120 self.logger.debug(f'Empty page {i} of {num_pages}') 121 
self.logger.debug(f'Last key: {last_key}') 122 page = None 123 124 pages.append(page) 125 self.logger.debug(f'Downloaded page {i} of {num_pages}') 126 127 # get the last key of the page for the next iter 128 last_key = page[-1]['id'] 129 130 # Save chunk if is set 131 if save_chunks: 132 self.logger.debug(f'Saving chunk {i} of {num_pages}') 133 with open(f'{self.chunks_folder}/chunk_{i}.json', 'w') as f: 134 f.write(str(page)) 135 136 # Update progress bar if is set 137 progress_bar.update(1) if progress_bar is not None else None 138 139 package_names = [row['id'] for page in pages for row in page] 140 self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}') 141 return package_names
Function to obtain the names of the packages in the NPM repository
Parameters
- page_size (int = 100): Number of packages to be requested in each page
- show_progress_bar (bool = True): Flag to show the progress bar
- save_chunks (bool = False): Flag to save the chunks of the registry in the chunks folder
Returns
- List[dict]: Declared as dictionaries, but the implementation returns the list of package name strings (the `id` field of each row)
Examples
>>> scraper = NpmScraper()
>>> scraper.obtain_package_names()