olivia_finder.data_source.repository_scrapers.bioconductor
from typing import Dict, List

import requests
from bs4 import BeautifulSoup
from typing_extensions import override

# Selenium imports: the Bioconductor package list is rendered client-side
# with JavaScript, so a real (headless) browser is needed to obtain it.
from selenium import webdriver
from selenium.webdriver.common.by import By

from . import r
from ..scraper_ds import ScraperDataSource
from ...myrequests.request_handler import RequestHandler
from ...utilities.exception import OliviaFinderException
from ...utilities.utilities import clean_string


class BioconductorScraper(ScraperDataSource):
    '''
    Class to scrape data from Bioconductor packages

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler for the scraper; if None, it will be initialized with
        a generic RequestHandler

    Attributes
    ----------
    BIOCONDUCTOR_LIST_URL : str
        The URL of the page with the list of Bioconductor packages
    BIOCONDUCTOR_PACKAGE_DATA_URL : str
        The URL of the page with the data of each Bioconductor package
    '''

    def __init__(
        self,
        request_handler: RequestHandler = None,
    ):
        '''
        Constructor
        '''
        # Initialize the class variables
        self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software'
        self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/'

        # Call the constructor of the parent class
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Get the list of packages from the Bioconductor website.

        The list is rendered dynamically with JavaScript, so the page is
        loaded in a headless Firefox driven by Selenium instead of a plain
        HTTP request.

        Returns
        -------
        List[str]
            Sorted list of package names

        Raises
        ------
        OliviaFinderException
            If the list of packages cannot be obtained

        Example
        -------
        >>> scraper = BioconductorScraper()
        >>> package_names = scraper.obtain_package_names()
        '''
        # Create the headless Firefox driver
        try:
            self.logger.debug("Creating the Selenium driver...")
            driver_options = webdriver.FirefoxOptions()
            # FIX: `FirefoxOptions.headless = True` is deprecated (and later
            # removed) in Selenium 4; passing the -headless argument is the
            # supported way to request headless mode.
            driver_options.add_argument("-headless")
            driver = webdriver.Firefox(options=driver_options)
        except Exception as e:
            raise OliviaFinderException("Exception occurred while creating the Selenium driver") from e

        # FIX: the original only closed the driver on the success path, so a
        # scraping failure leaked the browser process; always release it.
        try:
            # Scraping webpage with package list
            try:
                self.logger.debug("Scraping the Bioconductor website...")
                driver.get(self.BIOCONDUCTOR_LIST_URL)
                table = driver.find_element(By.ID, "biocViews_package_table")
                table_content = table.get_attribute("innerHTML")
            except Exception as e:
                raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from e
        finally:
            # quit() ends the whole browser session; close() would only close
            # the current window and could leave the process running.
            driver.quit()

        # Process the HTML to obtain packages
        try:
            self.logger.debug("Processing the HTML...")
            soup = BeautifulSoup(table_content, 'html.parser')
            packages = []
            for row in soup.find_all("tr"):
                packages.extend(
                    cell.find("a").text
                    for cell in row.find_all("td")
                    if cell.find("a")
                )
        except Exception as e:
            raise OliviaFinderException("Exception occurred while processing the HTML.") from e

        # Sort the list of packages
        packages.sort()
        self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")

        return packages

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL of the package page in the Bioconductor website

        Parameters
        ----------
        package_name : str
            The name of the package

        Returns
        -------
        str
            The URL of the package page in the Bioconductor website
        '''
        return f'{self.BIOCONDUCTOR_PACKAGE_DATA_URL}{package_name}.html'

    @override
    def _parser(self, response: requests.Response) -> Dict[str, str]:
        '''
        Parse the response from the Bioconductor website
        It's obtained from the list of packages in the Bioconductor website

        Parameters
        ----------
        response : requests.Response
            The response from the Bioconductor website

        Returns
        -------
        Dict[str, str]
            The data of the package: name, version, dependencies and url
        '''
        # Get the data from the table
        soup = BeautifulSoup(response.text, 'html.parser')

        name = soup.find('h1').text.strip()
        url = response.url

        table = soup.find('table', class_='details')
        rows = table.find_all('tr')

        # FIX: default the version so a page without a 'Version' row cannot
        # raise UnboundLocalError when building the result dict.
        version = ''

        # For each row, we get the cells if they are of interest
        dep_list = []
        imp_list = []
        for row in rows:
            cells = row.find_all('td')
            if len(cells) > 0:
                if cells[0].text == 'Version':
                    version = clean_string(cells[1].text.strip())
                elif cells[0].text == 'Depends':
                    depends = clean_string(cells[1].text.strip())
                    if depends != '':
                        dep_list = r.parse_dependencies(depends)
                elif cells[0].text == 'Imports':
                    imports = clean_string(cells[1].text.strip())
                    if imports != '':
                        imp_list = r.parse_dependencies(imports)

        # Remove entries that appear under both 'Depends' and 'Imports'
        for dep in dep_list:
            if dep in imp_list:
                imp_list.remove(dep)

        # Return the data
        return {
            'name': name,
            'version': version,
            'dependencies': list(dep_list + imp_list),
            'url': url
        }
class BioconductorScraper(ScraperDataSource):
    '''
    Scraper data source for Bioconductor packages.

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler

    Attributes
    ----------
    BIOCONDUCTOR_LIST_URL : str
        The URL of the page with the list of Bioconductor packages
    BIOCONDUCTOR_PACKAGE_DATA_URL : str
        The URL of the page with the data of each Bioconductor package
    '''

    def __init__(self, request_handler: RequestHandler = None):
        '''
        Constructor
        '''
        # Endpoints of the Bioconductor site used by this scraper
        self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software'
        self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/'
        # Delegate the rest of the setup to the parent class
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Get the list of packages from the Bioconductor website.

        Returns
        -------
        List[str]
            List of package names

        Raises
        ------
        OliviaFinderException
            If the list of packages cannot be obtained

        Example
        -------
        >>> scraper = BioconductorScraper()
        >>> package_names = scraper.obtain_package_names()
        '''
        # The package list is rendered dynamically with JavaScript, so the
        # page is loaded in a headless Firefox browser driven by Selenium.
        try:
            self.logger.debug("Creating the Selenium driver...")
            firefox_options = webdriver.FirefoxOptions()
            firefox_options.headless = True
            driver = webdriver.Firefox(options=firefox_options)
        except Exception as creation_error:
            raise OliviaFinderException("Exception occurred while creating the Selenium driver") from creation_error

        # Render the page and grab the inner HTML of the package table
        try:
            self.logger.debug("Scraping the Bioconductor website...")
            driver.get(self.BIOCONDUCTOR_LIST_URL)
            packages_table = driver.find_element(By.ID, "biocViews_package_table")
            raw_html = packages_table.get_attribute("innerHTML")
        except Exception as scraping_error:
            raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from scraping_error

        # Release the browser
        driver.close()

        # Extract one package name per linked table cell
        try:
            self.logger.debug("Processing the HTML...")
            parsed_table = BeautifulSoup(raw_html, 'html.parser')
            packages = []
            for table_row in parsed_table.find_all("tr"):
                for table_cell in table_row.find_all("td"):
                    if table_cell.find("a"):
                        packages.append(table_cell.find("a").text)
        except Exception as parsing_error:
            raise OliviaFinderException("Exception occurred while processing the HTML.") from parsing_error

        # Return the names alphabetically sorted
        packages.sort()
        self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")
        return packages

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL of the package page in the Bioconductor website.

        Parameters
        ----------
        package_name : str
            The name of the package

        Returns
        -------
        str
            The URL of the package page in the Bioconductor website
        '''
        return self.BIOCONDUCTOR_PACKAGE_DATA_URL + package_name + '.html'

    @override
    def _parser(self, response: requests.Response) -> Dict[str, str]:
        '''
        Parse the response of a package page of the Bioconductor website.

        Parameters
        ----------
        response : requests.Response
            The response from the Bioconductor website

        Returns
        -------
        Dict[str, str]
            The data of the package
        '''
        page = BeautifulSoup(response.text, 'html.parser')

        name = page.find('h1').text.strip()
        url = response.url

        details_rows = page.find('table', class_='details').find_all('tr')

        # Walk the details table and pick out the rows of interest
        dep_list = []
        imp_list = []
        for details_row in details_rows:
            cells = details_row.find_all('td')
            if not cells:
                continue
            label = cells[0].text
            if label == 'Version':
                version = clean_string(cells[1].text.strip())
            elif label == 'Depends':
                depends_field = clean_string(cells[1].text.strip())
                if depends_field != '':
                    dep_list = r.parse_dependencies(depends_field)
            elif label == 'Imports':
                imports_field = clean_string(cells[1].text.strip())
                if imports_field != '':
                    imp_list = r.parse_dependencies(imports_field)

        # An entry listed under both 'Depends' and 'Imports' is kept only once
        for shared in dep_list:
            if shared in imp_list:
                imp_list.remove(shared)

        return {
            'name': name,
            'version': version,
            'dependencies': list(dep_list + imp_list),
            'url': url
        }
Class to scrape data from Bioconductor packages
Parameters
- request_handler (RequestHandler = None): Request handler for the scraper; if None, it will be initialized with a generic RequestHandler.
Attributes
- BIOCONDUCTOR_LIST_URL (str): The URL of the page with the list of Bioconductor packages
- BIOCONDUCTOR_PACKAGE_DATA_URL (str): The URL of the page with the data of each Bioconductor package
BioconductorScraper(request_handler: olivia_finder.myrequests.request_handler.RequestHandler = None)
35 def __init__( 36 self, 37 request_handler: RequestHandler = None, 38 ): 39 ''' 40 Constructor 41 ''' 42 43 # Initialize the class variables 44 self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software' 45 self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/' 46 47 # Call the constructor of the parent class 48 super().__init__(request_handler)
Constructor
@override
def
obtain_package_names(self) -> List[str]:
@override
def obtain_package_names(self) -> List[str]:
    '''
    Get the list of packages from the Bioconductor website.

    Returns
    -------
    List[str]
        List of package names

    Raises
    ------
    OliviaFinderException
        If the list of packages cannot be obtained

    Example
    -------
    >>> scraper = BioconductorScraper()
    >>> package_names = scraper.obtain_package_names()
    '''
    # The list of packages is built with JavaScript, so the page has to be
    # rendered in a browser (headless Firefox via Selenium) rather than
    # fetched with a plain HTTP request.
    try:
        self.logger.debug("Creating the Selenium driver...")
        options = webdriver.FirefoxOptions()
        options.headless = True
        driver = webdriver.Firefox(options=options)
    except Exception as err:
        raise OliviaFinderException("Exception occurred while creating the Selenium driver") from err

    # Render the page and read the package table markup
    try:
        self.logger.debug("Scraping the Bioconductor website...")
        driver.get(self.BIOCONDUCTOR_LIST_URL)
        html_fragment = driver.find_element(By.ID, "biocViews_package_table").get_attribute("innerHTML")
    except Exception as err:
        raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from err

    # Shut the browser window
    driver.close()

    # Every linked cell of the table holds one package name
    try:
        self.logger.debug("Processing the HTML...")
        soup = BeautifulSoup(html_fragment, 'html.parser')
        anchors = [
            cell.find("a")
            for row in soup.find_all("tr")
            for cell in row.find_all("td")
        ]
        packages = [anchor.text for anchor in anchors if anchor]
    except Exception as err:
        raise OliviaFinderException("Exception occurred while processing the HTML.") from err

    # Deliver the names in alphabetical order
    packages.sort()
    self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")
    return packages
Get the list of packages from the Bioconductor website
Returns
- List[str]: List of package names
Raises
- OliviaFinderException: If the list of packages cannot be obtained
Example
>>> scraper = BioconductorScraper()
>>> package_names = scraper.obtain_package_names()