olivia_finder.data_source.repository_scrapers.cran

  1import requests
  2from typing_extensions import override
  3from bs4 import BeautifulSoup
  4from typing import Dict, List, Optional, Union
  5
  6from . import r
  7from ..scraper_ds import ScraperDataSource
  8from ...myrequests.request_handler import RequestHandler
  9from ...myrequests.job import RequestJob
 10from ...utilities.utilities import clean_string
 11
 12class CranScraper(ScraperDataSource):
 13    '''
 14    Class that scrapes the CRAN website to obtain information about R packages.
 15    Implements the abstract methods of the ScraperDataSource class.
 16    '''
 17
 18    # Class variables
 19
 20    def __init__(
 21        self, 
 22        request_handler: Optional[RequestHandler] = None
 23    ):
 24        '''
 25        Constructor of the class
 26
 27        Parameters
 28        ----------
 29        request_handler : Optional[RequestHandler], optional
 30            
 31        '''
 32
 33        # We initialize the class variables
 34        self.CRAN_PACKAGE_LIST_URL: str  = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
 35        self.CRAN_PACKAGE_DATA_URL: str  = "https://cran.r-project.org/package="
 36
 37        # We call the constructor of the parent class
 38        super().__init__(request_handler)
 39
 40    @override
 41    def obtain_package_names(self) -> List[str]:
 42        '''
 43        Get the list of packages in the CRAN website, by scraping the HTML of the page
 44
 45        Returns
 46        -------
 47        List[str]
 48            List of packages
 49            
 50        Examples
 51        --------
 52        >>> from olivia_finder.scraping.cran import CranScraper
 53        >>> cs = CranScraper()
 54        >>> package_names = cs.obtain_package_names()
 55        '''
 56
 57        job = self.request_handler.do_request(
 58            RequestJob(
 59                "CRAN Package List",
 60                self.CRAN_PACKAGE_LIST_URL
 61            )
 62        )
 63
 64        if job.response is None:
 65            self.logger.error('Error while obtaining the list of packages from CRAN')
 66            return []
 67
 68        # Parse HTML
 69        soup = BeautifulSoup(job.response.text, 'html.parser')
 70
 71        # Get table with packages
 72        table = soup.find("table")
 73        rows = table.find_all("tr")
 74
 75        # Clean the first row of the table (it contains the headers)
 76        rows.pop(0)
 77
 78        packages = []
 79
 80        # We iterate over each row of the table to get the names of the packages
 81        for row in rows:
 82            if not (cells := row.find_all("td")):
 83                continue
 84            try: 
 85                # Get the name of the package
 86                package_name = cells[0].find("a").text   
 87
 88            # If an error occurs, we show the error message
 89            except Exception as e:
 90                self.logger.debug(f'Error while obtaining the name of a package: {e}')
 91                self.logger.debug(f'Row: {row}')
 92                continue
 93
 94            # We add the package name to the list of packages
 95            packages.append(package_name)
 96            self.logger.debug(f'Package {package_name} added to the list of packages')
 97
 98        self.logger.info(f'Obtained {len(packages)} packages from {self.CRAN_PACKAGE_LIST_URL}')
 99        return packages
100
101    @override
102    def _build_url(self, package_name: str) -> str:
103        '''
104        Build the URL of a package page in the CRAN website
105
106        Parameters
107        ----------
108        package_name : str
109            Name of the package
110
111        Returns
112        -------
113        str
114            URL of the package page
115
116        '''
117        return f'{self.CRAN_PACKAGE_DATA_URL}{package_name}'
118
119    @override
120    def _parser(self, response: requests.Response) -> Optional[Dict]:
121        '''
122        Parse the HTML of a package page in the CRAN website
123
124        Parameters
125        ----------
126        response : requests.Response
127            Response of the HTTP request to the package page
128
129        Returns
130        -------
131        Dict
132            Dictionary with the information of the package, or
133        None
134            if an error occurs
135
136        '''
137        soup = BeautifulSoup(response.text, 'html.parser')
138
139        # Get package name
140        name = None
141        try:
142            d = soup.find('h2').text
143            name = clean_string(d).split(':')[0]
144        except Exception as e:
145            self.logger.debug(f'Response without package name: {e}')
146            return None
147
148        # Get package version
149        version = None
150        try:
151            d = soup.find('td', text='Version:').find_next_sibling('td').text
152            version = clean_string(d)
153        except Exception as e:
154            self.logger.debug(f'Response without package version: {e}')
155
156        # Get depends
157        dep_list = []
158        try:
159            d = soup.find('td', text='Depends:').find_next_sibling('td').text
160            depends = clean_string(d)
161            dep_list = r.parse_dependencies(depends)
162        except Exception as e:
163            self.logger.debug(f'Response without package dependencies: {e}')
164
165        # Get imports
166        imp_list = []
167        try:
168            d = soup.find('td', text='Imports:').find_next_sibling('td').text
169            imports = clean_string(d)
170            imp_list = r.parse_dependencies(imports)
171        except Exception as e:
172            self.logger.debug(f'Response without package imports: {e}')
173            
174        # Build dictionary with package data
175        # we consider that dependencies and imports are the same level of importance
176        # so we add them to the same list
177        dependencies = list(dep_list + imp_list)
178
179        return {
180            'name': name,
181            'version': version,
182            'dependencies': dependencies,
183            'url': f'{self.CRAN_PACKAGE_DATA_URL}{name}'
184        }
185
186    
 13class CranScraper(ScraperDataSource):
 14    '''
 15    Class that scrapes the CRAN website to obtain information about R packages.
 16    Implements the abstract methods of the ScraperDataSource class.
 17    '''
 18
 19    # Class variables
 20
 21    def __init__(
 22        self, 
 23        request_handler: Optional[RequestHandler] = None
 24    ):
 25        '''
 26        Constructor of the class
 27
 28        Parameters
 29        ----------
 30        request_handler : Optional[RequestHandler], optional
 31            
 32        '''
 33
 34        # We initialize the class variables
 35        self.CRAN_PACKAGE_LIST_URL: str  = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
 36        self.CRAN_PACKAGE_DATA_URL: str  = "https://cran.r-project.org/package="
 37
 38        # We call the constructor of the parent class
 39        super().__init__(request_handler)
 40
 41    @override
 42    def obtain_package_names(self) -> List[str]:
 43        '''
 44        Get the list of packages in the CRAN website, by scraping the HTML of the page
 45
 46        Returns
 47        -------
 48        List[str]
 49            List of packages
 50            
 51        Examples
 52        --------
 53        >>> from olivia_finder.scraping.cran import CranScraper
 54        >>> cs = CranScraper()
 55        >>> package_names = cs.obtain_package_names()
 56        '''
 57
 58        job = self.request_handler.do_request(
 59            RequestJob(
 60                "CRAN Package List",
 61                self.CRAN_PACKAGE_LIST_URL
 62            )
 63        )
 64
 65        if job.response is None:
 66            self.logger.error('Error while obtaining the list of packages from CRAN')
 67            return []
 68
 69        # Parse HTML
 70        soup = BeautifulSoup(job.response.text, 'html.parser')
 71
 72        # Get table with packages
 73        table = soup.find("table")
 74        rows = table.find_all("tr")
 75
 76        # Clean the first row of the table (it contains the headers)
 77        rows.pop(0)
 78
 79        packages = []
 80
 81        # We iterate over each row of the table to get the names of the packages
 82        for row in rows:
 83            if not (cells := row.find_all("td")):
 84                continue
 85            try: 
 86                # Get the name of the package
 87                package_name = cells[0].find("a").text   
 88
 89            # If an error occurs, we show the error message
 90            except Exception as e:
 91                self.logger.debug(f'Error while obtaining the name of a package: {e}')
 92                self.logger.debug(f'Row: {row}')
 93                continue
 94
 95            # We add the package name to the list of packages
 96            packages.append(package_name)
 97            self.logger.debug(f'Package {package_name} added to the list of packages')
 98
 99        self.logger.info(f'Obtained {len(packages)} packages from {self.CRAN_PACKAGE_LIST_URL}')
100        return packages
101
102    @override
103    def _build_url(self, package_name: str) -> str:
104        '''
105        Build the URL of a package page in the CRAN website
106
107        Parameters
108        ----------
109        package_name : str
110            Name of the package
111
112        Returns
113        -------
114        str
115            URL of the package page
116
117        '''
118        return f'{self.CRAN_PACKAGE_DATA_URL}{package_name}'
119
120    @override
121    def _parser(self, response: requests.Response) -> Optional[Dict]:
122        '''
123        Parse the HTML of a package page in the CRAN website
124
125        Parameters
126        ----------
127        response : requests.Response
128            Response of the HTTP request to the package page
129
130        Returns
131        -------
132        Dict
133            Dictionary with the information of the package, or
134        None
135            if an error occurs
136
137        '''
138        soup = BeautifulSoup(response.text, 'html.parser')
139
140        # Get package name
141        name = None
142        try:
143            d = soup.find('h2').text
144            name = clean_string(d).split(':')[0]
145        except Exception as e:
146            self.logger.debug(f'Response without package name: {e}')
147            return None
148
149        # Get package version
150        version = None
151        try:
152            d = soup.find('td', text='Version:').find_next_sibling('td').text
153            version = clean_string(d)
154        except Exception as e:
155            self.logger.debug(f'Response without package version: {e}')
156
157        # Get depends
158        dep_list = []
159        try:
160            d = soup.find('td', text='Depends:').find_next_sibling('td').text
161            depends = clean_string(d)
162            dep_list = r.parse_dependencies(depends)
163        except Exception as e:
164            self.logger.debug(f'Response without package dependencies: {e}')
165
166        # Get imports
167        imp_list = []
168        try:
169            d = soup.find('td', text='Imports:').find_next_sibling('td').text
170            imports = clean_string(d)
171            imp_list = r.parse_dependencies(imports)
172        except Exception as e:
173            self.logger.debug(f'Response without package imports: {e}')
174            
175        # Build dictionary with package data
176        # we consider that dependencies and imports are the same level of importance
177        # so we add them to the same list
178        dependencies = list(dep_list + imp_list)
179
180        return {
181            'name': name,
182            'version': version,
183            'dependencies': dependencies,
184            'url': f'{self.CRAN_PACKAGE_DATA_URL}{name}'
185        }

Class that scrapes the CRAN website to obtain information about R packages. Implements the abstract methods of the ScraperDataSource class.

CranScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
21    def __init__(
22        self, 
23        request_handler: Optional[RequestHandler] = None
24    ):
25        '''
26        Constructor of the class
27
28        Parameters
29        ----------
30        request_handler : Optional[RequestHandler], optional
31            
32        '''
33
34        # We initialize the class variables
35        self.CRAN_PACKAGE_LIST_URL: str  = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
36        self.CRAN_PACKAGE_DATA_URL: str  = "https://cran.r-project.org/package="
37
38        # We call the constructor of the parent class
39        super().__init__(request_handler)

Constructor of the class

Parameters
  • request_handler (Optional[RequestHandler], optional): Request handler used to perform the HTTP requests; forwarded to the parent constructor. Defaults to None.
@override
def obtain_package_names(self) -> List[str]:
 41    @override
 42    def obtain_package_names(self) -> List[str]:
 43        '''
 44        Get the list of packages in the CRAN website, by scraping the HTML of the page
 45
 46        Returns
 47        -------
 48        List[str]
 49            List of packages
 50            
 51        Examples
 52        --------
 53        >>> from olivia_finder.scraping.cran import CranScraper
 54        >>> cs = CranScraper()
 55        >>> package_names = cs.obtain_package_names()
 56        '''
 57
 58        job = self.request_handler.do_request(
 59            RequestJob(
 60                "CRAN Package List",
 61                self.CRAN_PACKAGE_LIST_URL
 62            )
 63        )
 64
 65        if job.response is None:
 66            self.logger.error('Error while obtaining the list of packages from CRAN')
 67            return []
 68
 69        # Parse HTML
 70        soup = BeautifulSoup(job.response.text, 'html.parser')
 71
 72        # Get table with packages
 73        table = soup.find("table")
 74        rows = table.find_all("tr")
 75
 76        # Clean the first row of the table (it contains the headers)
 77        rows.pop(0)
 78
 79        packages = []
 80
 81        # We iterate over each row of the table to get the names of the packages
 82        for row in rows:
 83            if not (cells := row.find_all("td")):
 84                continue
 85            try: 
 86                # Get the name of the package
 87                package_name = cells[0].find("a").text   
 88
 89            # If an error occurs, we show the error message
 90            except Exception as e:
 91                self.logger.debug(f'Error while obtaining the name of a package: {e}')
 92                self.logger.debug(f'Row: {row}')
 93                continue
 94
 95            # We add the package name to the list of packages
 96            packages.append(package_name)
 97            self.logger.debug(f'Package {package_name} added to the list of packages')
 98
 99        self.logger.info(f'Obtained {len(packages)} packages from {self.CRAN_PACKAGE_LIST_URL}')
100        return packages

Get the list of packages in the CRAN website, by scraping the HTML of the page

Returns
  • List[str]: List of packages
Examples
>>> from olivia_finder.scraping.cran import CranScraper
>>> cs = CranScraper()
>>> package_names = cs.obtain_package_names()