olivia_finder.data_source.repository_scrapers.cran
import requests
from typing_extensions import override
from bs4 import BeautifulSoup
from typing import Dict, List, Optional, Union

from . import r
from ..scraper_ds import ScraperDataSource
from ...myrequests.request_handler import RequestHandler
from ...myrequests.job import RequestJob
from ...utilities.utilities import clean_string


class CranScraper(ScraperDataSource):
    '''
    Class that scrapes the CRAN website to obtain information about R packages.
    Implements the abstract methods of the ScraperDataSource class.
    '''

    def __init__(
        self,
        request_handler: Optional[RequestHandler] = None
    ):
        '''
        Constructor of the class.

        Parameters
        ----------
        request_handler : Optional[RequestHandler], optional
            Handler used to perform the HTTP requests; forwarded to the
            parent ScraperDataSource constructor.
        '''

        # URL of the alphabetical index of all CRAN packages
        self.CRAN_PACKAGE_LIST_URL: str = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
        # Base URL of a single package page; the package name is appended
        self.CRAN_PACKAGE_DATA_URL: str = "https://cran.r-project.org/package="

        # We call the constructor of the parent class
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Get the list of packages in the CRAN website, by scraping the HTML of the page.

        Returns
        -------
        List[str]
            List of package names; empty if the page could not be fetched
            or contains no package table.

        Examples
        --------
        >>> from olivia_finder.scraping.cran import CranScraper
        >>> cs = CranScraper()
        >>> package_names = cs.obtain_package_names()
        '''
        job = self.request_handler.do_request(
            RequestJob(
                "CRAN Package List",
                self.CRAN_PACKAGE_LIST_URL
            )
        )

        if job.response is None:
            self.logger.error('Error while obtaining the list of packages from CRAN')
            return []

        # Parse HTML
        soup = BeautifulSoup(job.response.text, 'html.parser')

        # Robustness fix: the original crashed with AttributeError when the
        # page contained no <table> element.
        table = soup.find("table")
        if table is None:
            self.logger.error('Error while obtaining the list of packages from CRAN')
            return []

        rows = table.find_all("tr")

        # The first row of the table only contains the column headers
        rows.pop(0)

        packages = []

        # We iterate over each row of the table to get the names of the packages
        for row in rows:
            # Skip separator rows that carry no <td> cells
            if not (cells := row.find_all("td")):
                continue
            try:
                # The package name is the text of the first link in the row
                package_name = cells[0].find("a").text
            except Exception as e:
                self.logger.debug(f'Error while obtaining the name of a package: {e}')
                self.logger.debug(f'Row: {row}')
                continue

            packages.append(package_name)
            self.logger.debug(f'Package {package_name} added to the list of packages')

        self.logger.info(f'Obtained {len(packages)} packages from {self.CRAN_PACKAGE_LIST_URL}')
        return packages

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL of a package page in the CRAN website.

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package page
        '''
        return f'{self.CRAN_PACKAGE_DATA_URL}{package_name}'

    @override
    def _parser(self, response: requests.Response) -> Optional[Dict]:
        '''
        Parse the HTML of a package page in the CRAN website.

        Parameters
        ----------
        response : requests.Response
            Response of the HTTP request to the package page

        Returns
        -------
        Optional[Dict]
            Dictionary with the information of the package (name, version,
            dependencies, url), or None if the page has no package name.
        '''
        soup = BeautifulSoup(response.text, 'html.parser')

        # Package name: the <h2> heading is split on ':' and the first part
        # kept. A page without it is not a package page, so give up early.
        try:
            name = clean_string(soup.find('h2').text).split(':')[0]
        except Exception as e:
            self.logger.debug(f'Response without package name: {e}')
            return None

        # Package version (optional field)
        version = None
        try:
            # NOTE: `string=` replaces the `text=` keyword deprecated in bs4 4.4
            d = soup.find('td', string='Version:').find_next_sibling('td').text
            version = clean_string(d)
        except Exception as e:
            self.logger.debug(f'Response without package version: {e}')

        # Dependencies declared in the "Depends" field (optional)
        dep_list = []
        try:
            d = soup.find('td', string='Depends:').find_next_sibling('td').text
            dep_list = r.parse_dependencies(clean_string(d))
        except Exception as e:
            self.logger.debug(f'Response without package dependencies: {e}')

        # Dependencies declared in the "Imports" field (optional)
        imp_list = []
        try:
            d = soup.find('td', string='Imports:').find_next_sibling('td').text
            imp_list = r.parse_dependencies(clean_string(d))
        except Exception as e:
            self.logger.debug(f'Response without package imports: {e}')

        # We consider that dependencies and imports have the same level of
        # importance, so both are merged into a single list.
        return {
            'name': name,
            'version': version,
            'dependencies': dep_list + imp_list,
            'url': f'{self.CRAN_PACKAGE_DATA_URL}{name}'
        }
class CranScraper(ScraperDataSource):
    '''
    Scraper for the CRAN repository website.

    Collects the alphabetical package index and individual package pages,
    fulfilling the abstract interface declared by ScraperDataSource.
    '''

    def __init__(
        self,
        request_handler: Optional[RequestHandler] = None
    ):
        '''
        Constructor of the class.

        Parameters
        ----------
        request_handler : Optional[RequestHandler], optional
            Handler forwarded to the ScraperDataSource constructor.
        '''
        # Endpoints used by this scraper
        self.CRAN_PACKAGE_LIST_URL: str = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
        self.CRAN_PACKAGE_DATA_URL: str = "https://cran.r-project.org/package="

        # Delegate the remaining setup to the parent class
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Get the list of packages in the CRAN website, by scraping the HTML of the page.

        Returns
        -------
        List[str]
            List of packages

        Examples
        --------
        >>> from olivia_finder.scraping.cran import CranScraper
        >>> cs = CranScraper()
        >>> package_names = cs.obtain_package_names()
        '''
        job = self.request_handler.do_request(
            RequestJob("CRAN Package List", self.CRAN_PACKAGE_LIST_URL)
        )

        if job.response is None:
            self.logger.error('Error while obtaining the list of packages from CRAN')
            return []

        page = BeautifulSoup(job.response.text, 'html.parser')

        # The package index lives in the first <table>; its first <tr>
        # holds the headers, so it is dropped via slicing.
        data_rows = page.find("table").find_all("tr")[1:]

        names: List[str] = []
        for data_row in data_rows:
            cells = data_row.find_all("td")
            if not cells:
                # Row without data cells — nothing to extract
                continue
            try:
                # The name is the text of the anchor in the first cell
                link_text = cells[0].find("a").text
            except Exception as e:
                self.logger.debug(f'Error while obtaining the name of a package: {e}')
                self.logger.debug(f'Row: {data_row}')
                continue

            names.append(link_text)
            self.logger.debug(f'Package {link_text} added to the list of packages')

        self.logger.info(f'Obtained {len(names)} packages from {self.CRAN_PACKAGE_LIST_URL}')
        return names

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL of a package page in the CRAN website.

        Parameters
        ----------
        package_name : str
            Name of the package

        Returns
        -------
        str
            URL of the package page
        '''
        return f'{self.CRAN_PACKAGE_DATA_URL}{package_name}'

    @override
    def _parser(self, response: requests.Response) -> Optional[Dict]:
        '''
        Parse the HTML of a package page in the CRAN website.

        Parameters
        ----------
        response : requests.Response
            Response of the HTTP request to the package page

        Returns
        -------
        Optional[Dict]
            Package data dictionary, or None when no package name is found.
        '''
        page = BeautifulSoup(response.text, 'html.parser')

        # The <h2> heading carries "name: title"; keep the name part only.
        # Without it the page is unusable, so bail out.
        try:
            heading = page.find('h2').text
            package_name = clean_string(heading).split(':')[0]
        except Exception as e:
            self.logger.debug(f'Response without package name: {e}')
            return None

        # Version field — optional, stays None when absent
        package_version = None
        try:
            raw = page.find('td', text='Version:').find_next_sibling('td').text
            package_version = clean_string(raw)
        except Exception as e:
            self.logger.debug(f'Response without package version: {e}')

        # "Depends" field — optional, empty list when absent
        depends_list = []
        try:
            raw = page.find('td', text='Depends:').find_next_sibling('td').text
            depends_list = r.parse_dependencies(clean_string(raw))
        except Exception as e:
            self.logger.debug(f'Response without package dependencies: {e}')

        # "Imports" field — optional, empty list when absent
        imports_list = []
        try:
            raw = page.find('td', text='Imports:').find_next_sibling('td').text
            imports_list = r.parse_dependencies(clean_string(raw))
        except Exception as e:
            self.logger.debug(f'Response without package imports: {e}')

        # Depends and Imports are treated with equal importance and merged
        merged = list(depends_list + imports_list)

        return {
            'name': package_name,
            'version': package_version,
            'dependencies': merged,
            'url': f'{self.CRAN_PACKAGE_DATA_URL}{package_name}'
        }
Class that scrapes the CRAN website to obtain information about R packages. Implements the abstract methods of the ScraperDataSource class.
CranScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
def __init__(
    self,
    request_handler: Optional[RequestHandler] = None
):
    '''
    Constructor of the class.

    Parameters
    ----------
    request_handler : Optional[RequestHandler], optional
        Handler used to perform the HTTP requests; forwarded to the
        parent ScraperDataSource constructor.
    '''

    # URL of the alphabetical index of all CRAN packages
    self.CRAN_PACKAGE_LIST_URL: str = "https://cran.r-project.org/web/packages/available_packages_by_name.html"
    # Base URL of a single package page; the package name is appended
    self.CRAN_PACKAGE_DATA_URL: str = "https://cran.r-project.org/package="

    # We call the constructor of the parent class
    super().__init__(request_handler)
Constructor of the class
Parameters
- request_handler (Optional[RequestHandler], optional): request handler used to perform the HTTP requests; passed to the parent ScraperDataSource constructor.
@override
def
obtain_package_names(self) -> List[str]:
@override
def obtain_package_names(self) -> List[str]:
    '''
    Get the list of packages in the CRAN website, by scraping the HTML of the page.

    Returns
    -------
    List[str]
        List of package names; empty if the page could not be fetched
        or contains no package table.

    Examples
    --------
    >>> from olivia_finder.scraping.cran import CranScraper
    >>> cs = CranScraper()
    >>> package_names = cs.obtain_package_names()
    '''
    job = self.request_handler.do_request(
        RequestJob(
            "CRAN Package List",
            self.CRAN_PACKAGE_LIST_URL
        )
    )

    if job.response is None:
        self.logger.error('Error while obtaining the list of packages from CRAN')
        return []

    # Parse HTML
    soup = BeautifulSoup(job.response.text, 'html.parser')

    # Robustness fix: the original crashed with AttributeError when the
    # page contained no <table> element.
    table = soup.find("table")
    if table is None:
        self.logger.error('Error while obtaining the list of packages from CRAN')
        return []

    rows = table.find_all("tr")

    # The first row of the table only contains the column headers
    rows.pop(0)

    packages = []

    # We iterate over each row of the table to get the names of the packages
    for row in rows:
        # Skip separator rows that carry no <td> cells
        if not (cells := row.find_all("td")):
            continue
        try:
            # The package name is the text of the first link in the row
            package_name = cells[0].find("a").text
        except Exception as e:
            self.logger.debug(f'Error while obtaining the name of a package: {e}')
            self.logger.debug(f'Row: {row}')
            continue

        packages.append(package_name)
        self.logger.debug(f'Package {package_name} added to the list of packages')

    self.logger.info(f'Obtained {len(packages)} packages from {self.CRAN_PACKAGE_LIST_URL}')
    return packages
Get the list of packages in the CRAN website, by scraping the HTML of the page
Returns
- List[str]: List of packages
Examples
>>> from olivia_finder.scraping.cran import CranScraper
>>> cs = CranScraper()
>>> package_names = cs.obtain_package_names()