olivia_finder.data_source.repository_scrapers.npm

import json
import os
from typing import List, Optional

import requests
from tqdm import tqdm
from typing_extensions import override

from ..scraper_ds import ScraperDataSource
from ...myrequests.job import RequestJob
from ...myrequests.request_handler import RequestHandler
from ...utilities.config import Configuration
 11
 12
 13class NpmScraper(ScraperDataSource):
 14    '''
 15    Class that scrapes the NPM website to obtain information about JavaScript packages
 16    
 17    Attributes
 18    ----------
 19    NPM_PACKAGE_REGISTRY_URL : str
 20        URL of the page that contains the list of packages
 21    NPM_PACKAGE_LIST_URL : str
 22        URL of the page that contains the data of a package
 23    NPM_REPO_URL : str
 24        URL of the page that contains the data of a package
 25        
 26    Parameters
 27    ----------
 28    request_handler : RequestHandler = None
 29        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
 30    '''
 31
 32    # Constants
 33    NPM_PACKAGE_REGISTRY_URL: str   = 'https://registry.npmjs.org/'
 34    NPM_PACKAGE_LIST_URL: str       = 'https://skimdb.npmjs.com/registry/_all_docs'
 35    NPM_REPO_URL: str               = 'https://www.npmjs.com/package'
 36
 37    def __init__(
 38        self, 
 39        request_handler: Optional[RequestHandler] = None,
 40    ):
 41        '''
 42        Constructor of the class
 43        '''
 44
 45        self.chunks_folder = None
 46        '''
 47        Folder where the chunks will be saved, 
 48        This is because the NPM registry is too big to be downloaded in one go
 49        '''
 50
 51        super().__init__(request_handler)
 52
 53    @override
 54    def obtain_package_names(
 55        self, 
 56        page_size: int = 100, 
 57        show_progress_bar: bool = True,
 58        save_chunks: bool = False,
 59    ) -> List[dict]:
 60
 61        '''
 62        Function to obtain the names of the packages in the NPM repository
 63
 64        Parameters
 65        ----------
 66        page_size : int = 100
 67            Number of packages to be requested in each page
 68        show_progress_bar : bool = True
 69            Flag to show the progress bar
 70        save_chunks : bool = False
 71            Flag to save the chunks of the registry in the chunks folder
 72        
 73        Returns
 74        -------
 75        List[dict]
 76            List of dictionaries with the name and version of the packages
 77        
 78        Examples
 79        --------
 80        >>> scraper = NpmScraper()
 81        >>> scraper.obtain_package_names()
 82        '''
 83        # Get the total number of packages
 84        response = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30)
 85        total_packages = response.json()['doc_count']
 86
 87        # Calculate the number of pages (chunks)
 88        num_pages = (total_packages // page_size) + 1
 89
 90        self.logger.debug(f'Total number of packages: {total_packages}')
 91        self.logger.debug(f'Number of pages: {num_pages}')
 92
 93        # Initialize the progress bar if is set
 94        progress_bar = tqdm(total=num_pages) if show_progress_bar else None
 95
 96        # Initialize the chunks folder if is set
 97        if save_chunks:
 98            self.logger.debug(f'Saving chunks at: {self.chunks_folder}')
 99            self._init_chunks_folder()
100
101        # Obtain the names of the packages requesting the pages
102        pages = []
103        last_key = None
104        for i in range(num_pages):
105
106            # Download the page
107            # Handle disconnects
108            page = None
109            while page is None:
110                try:
111                    page = self._download_page(last_key, page_size)
112                except requests.exceptions.ConnectionError:
113                    self.logger.debug(f'Connection error in page {i} of {num_pages}')
114                    self.logger.debug(f'Last key: {last_key}')
115                    self.logger.debug('Retrying...')
116
117                # check if the page is empty
118                if len(page) == 0: # type: ignore
119                    self.logger.debug(f'Empty page {i} of {num_pages}')
120                    self.logger.debug(f'Last key: {last_key}')
121                    page = None
122
123            pages.append(page)
124            self.logger.debug(f'Downloaded page {i} of {num_pages}')
125
126            # get the last key of the page for the next iter
127            last_key = page[-1]['id']
128
129            # Save chunk if is set
130            if save_chunks:
131                self.logger.debug(f'Saving chunk {i} of {num_pages}')
132                with open(f'{self.chunks_folder}/chunk_{i}.json', 'w') as f:
133                    f.write(str(page))            
134
135            # Update progress bar if is set
136            progress_bar.update(1) if progress_bar is not None else None
137
138        package_names = [row['id'] for page in pages for row in page]
139        self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}')
140        return package_names
141
142    def _init_chunks_folder(self):
143        '''
144        Function to initialize the chunks folder, where the chunks will be saved
145        Loads the path from the configuration file
146        '''
147        self.chunks_folder = f'{Configuration().get_key("folders", "working_dir")}/npm_package_names_chunks'
148        os.makedirs(self.chunks_folder, exist_ok=True)
149
150    def _download_page(
151        self, 
152        start_key: Optional[str] = None,
153        size: int = 1000, 
154        retries: int = 5
155    )-> List[dict]:
156        '''
157        Function to download a page of documents from the NPM repository and return a list of dictionaries with the name and version of the packages
158        
159        Parameters
160        ----------
161        start_key : str = None
162            Key to start the download
163        size : int = None
164            Size of the page to download
165        retries : int = None
166            Number of retries to download the page
167        
168        Returns
169        -------
170        List[dict]
171            List of dictionaries with the name and version of the packages
172        '''
173
174        # Fix for the first page
175        if start_key is None:
176            params = {'limit': size}
177        else:
178            encode_start_key = "\"" + start_key + "\""
179            params = {'startkey': encode_start_key, 'limit': size}
180
181        # Download the page
182        job = self.request_handler.do_request(
183            RequestJob(
184                key='npm_download_page',
185                url=self.NPM_PACKAGE_LIST_URL,
186                params=params,
187            )
188        )
189        
190        # If the response is None, return an empty list
191        if job.response is None:
192            self.logger.debug(f'None response at __download_page: url={self.NPM_PACKAGE_LIST_URL}')
193            return []
194                        
195        # If the response returns an error, return an empty list
196        try:
197            data = job.response.json()
198
199        except Exception as e:
200
201            msg = f'EXCEPTION at __download_page: url={self.NPM_PACKAGE_LIST_URL}\n'
202            msg += f'Response: {job.response.text}\n'
203            msg += f'Params: {params}\n'
204            msg += f'Retrying, times left: {retries}\n'
205            self.logger.debug(msg)
206            
207            return self._download_page(start_key, size, retries-1)
208            
209        if data.keys() == {'error', 'reason'}:
210            return self._download_page(start_key, size, retries-1)
211        else:
212            # Fix of selecting by last key
213            return data['rows'][1:]
214    
215    @override
216    def _build_url(self, package_name: str):
217        '''
218        Function to build the URL of the package
219
220        Parameters
221        ----------
222        package_name : str
223            Name of the package
224
225        Returns
226        -------
227        str
228            URL of the package
229        '''
230        return f'{self.NPM_PACKAGE_REGISTRY_URL}/{package_name}'
231
232    @override
233    def _parser(self, response: requests.Response) -> dict:
234        '''
235        Parse the response of the request
236
237        Parameters
238        ----------
239        response : requests.Response
240            Response of the request
241
242        Returns
243        -------
244        dict
245            dictionary with the parsed data
246
247        Examples
248        --------
249        >>> response = requests.get('https://registry.npmjs.org/express')
250        >>> parser(response)
251        {
252            'name': 'express',
253            'version': '4.17.1',
254            'dependencies': [
255                {'name': 'accepts', 'version': '1.3.7'},
256                {'name': 'array-flatten', 'version': '1.1.1'},
257                {'name': 'body-parser', 'version': '1.19.0'},
258                {'name': 'content-disposition', 'version': '0.5.3'},
259                {'name': 'content-type', 'version': '1.0.4'}
260            ],
261            'url': 'https://www.npmjs.com/package/express'
262        }
263        '''
264
265        response_json = response.json()
266
267        # Check if the package exists
268        if 'error' in response_json:
269            return {}
270
271        # Get the package name and version
272        try:
273            package_name = response_json['_id']
274        except KeyError:
275            # If the package does not have a name, return an empty dict
276            return {}
277        
278        try:
279            package_version = response_json['dist-tags']['latest']
280        except KeyError:
281            package_version = None
282
283        # get the dependencies
284        try:
285            dependencies = response_json['versions'][package_version]['dependencies']
286            if dependencies is None:
287                dependencies = {}
288        except KeyError:
289            dependencies = {}
290
291        dep_list = [
292            {'name': key, 'version': value} for key, value in dependencies.items()
293        ]
294
295        # get dev dependencies
296        try:
297            dev_dependencies = response_json['versions'][package_version]['devDependencies']
298            if dev_dependencies is None:
299                dev_dependencies = {}
300        except KeyError:
301            dev_dependencies = {}
302
303        dep_list += [
304            {'name': key, 'version': value} for key, value in dev_dependencies.items()
305        ]
306
307        return {
308            'name': package_name,
309            'version': package_version,
310            'dependencies': dep_list,
311            'url': f'{self.NPM_REPO_URL}/{package_name}'
312        }
313
314    
315    
 14class NpmScraper(ScraperDataSource):
 15    '''
 16    Class that scrapes the NPM website to obtain information about JavaScript packages
 17    
 18    Attributes
 19    ----------
 20    NPM_PACKAGE_REGISTRY_URL : str
 21        URL of the page that contains the list of packages
 22    NPM_PACKAGE_LIST_URL : str
 23        URL of the page that contains the data of a package
 24    NPM_REPO_URL : str
 25        URL of the page that contains the data of a package
 26        
 27    Parameters
 28    ----------
 29    request_handler : RequestHandler = None
 30        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
 31    '''
 32
 33    # Constants
 34    NPM_PACKAGE_REGISTRY_URL: str   = 'https://registry.npmjs.org/'
 35    NPM_PACKAGE_LIST_URL: str       = 'https://skimdb.npmjs.com/registry/_all_docs'
 36    NPM_REPO_URL: str               = 'https://www.npmjs.com/package'
 37
 38    def __init__(
 39        self, 
 40        request_handler: Optional[RequestHandler] = None,
 41    ):
 42        '''
 43        Constructor of the class
 44        '''
 45
 46        self.chunks_folder = None
 47        '''
 48        Folder where the chunks will be saved, 
 49        This is because the NPM registry is too big to be downloaded in one go
 50        '''
 51
 52        super().__init__(request_handler)
 53
 54    @override
 55    def obtain_package_names(
 56        self, 
 57        page_size: int = 100, 
 58        show_progress_bar: bool = True,
 59        save_chunks: bool = False,
 60    ) -> List[dict]:
 61
 62        '''
 63        Function to obtain the names of the packages in the NPM repository
 64
 65        Parameters
 66        ----------
 67        page_size : int = 100
 68            Number of packages to be requested in each page
 69        show_progress_bar : bool = True
 70            Flag to show the progress bar
 71        save_chunks : bool = False
 72            Flag to save the chunks of the registry in the chunks folder
 73        
 74        Returns
 75        -------
 76        List[dict]
 77            List of dictionaries with the name and version of the packages
 78        
 79        Examples
 80        --------
 81        >>> scraper = NpmScraper()
 82        >>> scraper.obtain_package_names()
 83        '''
 84        # Get the total number of packages
 85        response = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30)
 86        total_packages = response.json()['doc_count']
 87
 88        # Calculate the number of pages (chunks)
 89        num_pages = (total_packages // page_size) + 1
 90
 91        self.logger.debug(f'Total number of packages: {total_packages}')
 92        self.logger.debug(f'Number of pages: {num_pages}')
 93
 94        # Initialize the progress bar if is set
 95        progress_bar = tqdm(total=num_pages) if show_progress_bar else None
 96
 97        # Initialize the chunks folder if is set
 98        if save_chunks:
 99            self.logger.debug(f'Saving chunks at: {self.chunks_folder}')
100            self._init_chunks_folder()
101
102        # Obtain the names of the packages requesting the pages
103        pages = []
104        last_key = None
105        for i in range(num_pages):
106
107            # Download the page
108            # Handle disconnects
109            page = None
110            while page is None:
111                try:
112                    page = self._download_page(last_key, page_size)
113                except requests.exceptions.ConnectionError:
114                    self.logger.debug(f'Connection error in page {i} of {num_pages}')
115                    self.logger.debug(f'Last key: {last_key}')
116                    self.logger.debug('Retrying...')
117
118                # check if the page is empty
119                if len(page) == 0: # type: ignore
120                    self.logger.debug(f'Empty page {i} of {num_pages}')
121                    self.logger.debug(f'Last key: {last_key}')
122                    page = None
123
124            pages.append(page)
125            self.logger.debug(f'Downloaded page {i} of {num_pages}')
126
127            # get the last key of the page for the next iter
128            last_key = page[-1]['id']
129
130            # Save chunk if is set
131            if save_chunks:
132                self.logger.debug(f'Saving chunk {i} of {num_pages}')
133                with open(f'{self.chunks_folder}/chunk_{i}.json', 'w') as f:
134                    f.write(str(page))            
135
136            # Update progress bar if is set
137            progress_bar.update(1) if progress_bar is not None else None
138
139        package_names = [row['id'] for page in pages for row in page]
140        self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}')
141        return package_names
142
143    def _init_chunks_folder(self):
144        '''
145        Function to initialize the chunks folder, where the chunks will be saved
146        Loads the path from the configuration file
147        '''
148        self.chunks_folder = f'{Configuration().get_key("folders", "working_dir")}/npm_package_names_chunks'
149        os.makedirs(self.chunks_folder, exist_ok=True)
150
151    def _download_page(
152        self, 
153        start_key: Optional[str] = None,
154        size: int = 1000, 
155        retries: int = 5
156    )-> List[dict]:
157        '''
158        Function to download a page of documents from the NPM repository and return a list of dictionaries with the name and version of the packages
159        
160        Parameters
161        ----------
162        start_key : str = None
163            Key to start the download
164        size : int = None
165            Size of the page to download
166        retries : int = None
167            Number of retries to download the page
168        
169        Returns
170        -------
171        List[dict]
172            List of dictionaries with the name and version of the packages
173        '''
174
175        # Fix for the first page
176        if start_key is None:
177            params = {'limit': size}
178        else:
179            encode_start_key = "\"" + start_key + "\""
180            params = {'startkey': encode_start_key, 'limit': size}
181
182        # Download the page
183        job = self.request_handler.do_request(
184            RequestJob(
185                key='npm_download_page',
186                url=self.NPM_PACKAGE_LIST_URL,
187                params=params,
188            )
189        )
190        
191        # If the response is None, return an empty list
192        if job.response is None:
193            self.logger.debug(f'None response at __download_page: url={self.NPM_PACKAGE_LIST_URL}')
194            return []
195                        
196        # If the response returns an error, return an empty list
197        try:
198            data = job.response.json()
199
200        except Exception as e:
201
202            msg = f'EXCEPTION at __download_page: url={self.NPM_PACKAGE_LIST_URL}\n'
203            msg += f'Response: {job.response.text}\n'
204            msg += f'Params: {params}\n'
205            msg += f'Retrying, times left: {retries}\n'
206            self.logger.debug(msg)
207            
208            return self._download_page(start_key, size, retries-1)
209            
210        if data.keys() == {'error', 'reason'}:
211            return self._download_page(start_key, size, retries-1)
212        else:
213            # Fix of selecting by last key
214            return data['rows'][1:]
215    
216    @override
217    def _build_url(self, package_name: str):
218        '''
219        Function to build the URL of the package
220
221        Parameters
222        ----------
223        package_name : str
224            Name of the package
225
226        Returns
227        -------
228        str
229            URL of the package
230        '''
231        return f'{self.NPM_PACKAGE_REGISTRY_URL}/{package_name}'
232
233    @override
234    def _parser(self, response: requests.Response) -> dict:
235        '''
236        Parse the response of the request
237
238        Parameters
239        ----------
240        response : requests.Response
241            Response of the request
242
243        Returns
244        -------
245        dict
246            dictionary with the parsed data
247
248        Examples
249        --------
250        >>> response = requests.get('https://registry.npmjs.org/express')
251        >>> parser(response)
252        {
253            'name': 'express',
254            'version': '4.17.1',
255            'dependencies': [
256                {'name': 'accepts', 'version': '1.3.7'},
257                {'name': 'array-flatten', 'version': '1.1.1'},
258                {'name': 'body-parser', 'version': '1.19.0'},
259                {'name': 'content-disposition', 'version': '0.5.3'},
260                {'name': 'content-type', 'version': '1.0.4'}
261            ],
262            'url': 'https://www.npmjs.com/package/express'
263        }
264        '''
265
266        response_json = response.json()
267
268        # Check if the package exists
269        if 'error' in response_json:
270            return {}
271
272        # Get the package name and version
273        try:
274            package_name = response_json['_id']
275        except KeyError:
276            # If the package does not have a name, return an empty dict
277            return {}
278        
279        try:
280            package_version = response_json['dist-tags']['latest']
281        except KeyError:
282            package_version = None
283
284        # get the dependencies
285        try:
286            dependencies = response_json['versions'][package_version]['dependencies']
287            if dependencies is None:
288                dependencies = {}
289        except KeyError:
290            dependencies = {}
291
292        dep_list = [
293            {'name': key, 'version': value} for key, value in dependencies.items()
294        ]
295
296        # get dev dependencies
297        try:
298            dev_dependencies = response_json['versions'][package_version]['devDependencies']
299            if dev_dependencies is None:
300                dev_dependencies = {}
301        except KeyError:
302            dev_dependencies = {}
303
304        dep_list += [
305            {'name': key, 'version': value} for key, value in dev_dependencies.items()
306        ]
307
308        return {
309            'name': package_name,
310            'version': package_version,
311            'dependencies': dep_list,
312            'url': f'{self.NPM_REPO_URL}/{package_name}'
313        }

Class that scrapes the NPM website to obtain information about JavaScript packages

Attributes
  • NPM_PACKAGE_REGISTRY_URL (str): Root URL of the NPM registry, used to query the total package count
  • NPM_PACKAGE_LIST_URL (str): URL of the registry view that lists all package documents
  • NPM_REPO_URL (str): Base URL of a package's page on the NPM website
Parameters
  • request_handler (RequestHandler = None): Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
NpmScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
38    def __init__(
39        self, 
40        request_handler: Optional[RequestHandler] = None,
41    ):
42        '''
43        Constructor of the class
44        '''
45
46        self.chunks_folder = None
47        '''
48        Folder where the chunks will be saved, 
49        This is because the NPM registry is too big to be downloaded in one go
50        '''
51
52        super().__init__(request_handler)

Constructor of the class

chunks_folder

Folder where the chunks will be saved, This is because the NPM registry is too big to be downloaded in one go

@override
def obtain_package_names( self, page_size: int = 100, show_progress_bar: bool = True, save_chunks: bool = False) -> List[dict]:
 54    @override
 55    def obtain_package_names(
 56        self, 
 57        page_size: int = 100, 
 58        show_progress_bar: bool = True,
 59        save_chunks: bool = False,
 60    ) -> List[dict]:
 61
 62        '''
 63        Function to obtain the names of the packages in the NPM repository
 64
 65        Parameters
 66        ----------
 67        page_size : int = 100
 68            Number of packages to be requested in each page
 69        show_progress_bar : bool = True
 70            Flag to show the progress bar
 71        save_chunks : bool = False
 72            Flag to save the chunks of the registry in the chunks folder
 73        
 74        Returns
 75        -------
 76        List[dict]
 77            List of dictionaries with the name and version of the packages
 78        
 79        Examples
 80        --------
 81        >>> scraper = NpmScraper()
 82        >>> scraper.obtain_package_names()
 83        '''
 84        # Get the total number of packages
 85        response = requests.get(self.NPM_PACKAGE_REGISTRY_URL, timeout=30)
 86        total_packages = response.json()['doc_count']
 87
 88        # Calculate the number of pages (chunks)
 89        num_pages = (total_packages // page_size) + 1
 90
 91        self.logger.debug(f'Total number of packages: {total_packages}')
 92        self.logger.debug(f'Number of pages: {num_pages}')
 93
 94        # Initialize the progress bar if is set
 95        progress_bar = tqdm(total=num_pages) if show_progress_bar else None
 96
 97        # Initialize the chunks folder if is set
 98        if save_chunks:
 99            self.logger.debug(f'Saving chunks at: {self.chunks_folder}')
100            self._init_chunks_folder()
101
102        # Obtain the names of the packages requesting the pages
103        pages = []
104        last_key = None
105        for i in range(num_pages):
106
107            # Download the page
108            # Handle disconnects
109            page = None
110            while page is None:
111                try:
112                    page = self._download_page(last_key, page_size)
113                except requests.exceptions.ConnectionError:
114                    self.logger.debug(f'Connection error in page {i} of {num_pages}')
115                    self.logger.debug(f'Last key: {last_key}')
116                    self.logger.debug('Retrying...')
117
118                # check if the page is empty
119                if len(page) == 0: # type: ignore
120                    self.logger.debug(f'Empty page {i} of {num_pages}')
121                    self.logger.debug(f'Last key: {last_key}')
122                    page = None
123
124            pages.append(page)
125            self.logger.debug(f'Downloaded page {i} of {num_pages}')
126
127            # get the last key of the page for the next iter
128            last_key = page[-1]['id']
129
130            # Save chunk if is set
131            if save_chunks:
132                self.logger.debug(f'Saving chunk {i} of {num_pages}')
133                with open(f'{self.chunks_folder}/chunk_{i}.json', 'w') as f:
134                    f.write(str(page))            
135
136            # Update progress bar if is set
137            progress_bar.update(1) if progress_bar is not None else None
138
139        package_names = [row['id'] for page in pages for row in page]
140        self.logger.info(f'Obtained {len(package_names)} packages from {self.NPM_PACKAGE_LIST_URL}')
141        return package_names

Function to obtain the names of the packages in the NPM repository

Parameters
  • page_size (int = 100): Number of packages to be requested in each page
  • show_progress_bar (bool = True): Flag to show the progress bar
  • save_chunks (bool = False): Flag to save the chunks of the registry in the chunks folder
Returns
  • List[dict]: List of dictionaries with the name and version of the packages
Examples
>>> scraper = NpmScraper()
>>> scraper.obtain_package_names()