olivia_finder.data_source.repository_scrapers.bioconductor

  1from typing import Dict, List
  2import requests
  3from bs4 import BeautifulSoup
  4from typing_extensions import override
  5
  6from ...utilities.exception import OliviaFinderException
  7from . import r
  8from ..scraper_ds import ScraperDataSource
  9from ...myrequests.request_handler import RequestHandler
 10from ...utilities.utilities import clean_string
 11
 12# Selenium imports (for scraping JavaScript pages)
 13from selenium import webdriver                                    
 14from selenium.webdriver.common.by import By
 15
 16class BioconductorScraper(ScraperDataSource):
 17    '''
 18    Class to scrape data from Bioconductor packages
 19    
 20    Parameters
 21    ----------
 22
 23    request_handler : RequestHandler = None
 24        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
 25        
 26    Attributes
 27    ----------
 28    BIOCONDUCTOR_LIST_URL : str
 29        The URL of the page with the list of Bioconductor packages
 30    BIOCONDUCTOR_PACKAGE_DATA_URL : str
 31        The URL of the page with the data of each Bioconductor package
 32    '''
 33    
 34    def __init__(
 35        self, 
 36        request_handler: RequestHandler = None,
 37    ):
 38        '''
 39        Constructor
 40        '''
 41
 42        # Initialize the class variables
 43        self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software'
 44        self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/'
 45
 46        # Call the constructor of the parent class
 47        super().__init__(request_handler)
 48
 49
 50
 51    @override
 52    def obtain_package_names(self) -> List[str]:
 53        '''
 54        Get the list of packages from the Bioconductor website
 55
 56        Returns
 57        -------
 58        List[str]
 59            List of package names
 60            
 61        Raises
 62        ------
 63        OliviaFinderException
 64            If the list of packages cannot be obtained
 65            
 66        Example
 67        -------
 68        >>> scraper = BioconductorScraper()
 69        >>> package_names = scraper.obtain_package_names()
 70        '''
 71
 72        # # Make HTTP request to the list of packages page
 73        # # Is necessary to use Selenium because the list of packages is loaded dynamically
 74        # # with JavaScript, we need to render the page to get the list of packages
 75
 76        # # Load the Selenium driver
 77        # driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
 78
 79
 80        # Create the driver
 81        try:
 82            self.logger.debug("Creating the Selenium driver...")
 83
 84            driver_options = webdriver.FirefoxOptions()
 85            driver_options.headless = True
 86            driver = webdriver.Firefox(
 87                options = driver_options
 88            )
 89        except Exception as e:
 90            raise OliviaFinderException("Exception occurred while creating the Selenium driver") from e
 91    
 92        # Scraping webpage with package list
 93        try:
 94            self.logger.debug("Scraping the Bioconductor website...")
 95            driver.get(self.BIOCONDUCTOR_LIST_URL)
 96            table = driver.find_element(By.ID, "biocViews_package_table")
 97            table_content = table.get_attribute("innerHTML")
 98        except Exception as e:
 99            raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from e
100
101        # Close the driver
102        driver.close()
103
104        # Process the HTML to obtain packages
105        try:
106            self.logger.debug("Processing the HTML...")
107            soup = BeautifulSoup(table_content, 'html.parser')
108            packages = []
109            for row in soup.find_all("tr"):
110                packages.extend(
111                    cell.find("a").text
112                    for cell in row.find_all("td")
113                    if cell.find("a")
114                )
115        except Exception as e:
116            raise OliviaFinderException("Exception occurred while processing the HTML.") from e
117        
118        # Sort the list of packages
119        packages.sort()
120        self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")
121        
122        return packages
123    
124    @override
125    def _build_url(self, package_name: str) -> str:
126        '''
127        Build the URL of the package page in the Bioconductor website
128
129        Parameters
130        ----------
131        package_name : str
132            The name of the package
133        
134        Returns
135        -------
136        str
137            The URL of the package page in the Bioconductor website
138        '''
139        return f'{self.BIOCONDUCTOR_PACKAGE_DATA_URL}{package_name}.html'
140
141    @override
142    def _parser(self, response: requests.Response) -> Dict[str, str]:
143        '''
144        Parse the response from the Bioconductor website
145        It's obtained from the list of packages in the Bioconductor website
146
147        Parameters
148        ----------
149        response : requests.Response
150            The response from the Bioconductor website
151        
152        Returns
153        -------
154        Dict[str, str]
155            The data of the package
156        '''        
157        # Get the data from the table
158        soup = BeautifulSoup(response.text, 'html.parser')
159
160        name = soup.find('h1').text.strip()
161        url = response.url
162
163        table = soup.find('table', class_='details')
164        rows = table.find_all('tr')
165
166        # For each row, we get the cells if they are of interest
167        dep_list = []
168        imp_list = []
169        for row in rows:
170            cells = row.find_all('td')
171            if len(cells) > 0:
172                if cells[0].text == 'Version':
173                    version = clean_string(cells[1].text.strip())
174                elif cells[0].text == 'Depends':
175                    depends = clean_string(cells[1].text.strip())
176                    if depends != '':
177                        dep_list = r.parse_dependencies(depends)
178                elif cells[0].text == 'Imports':
179                    imports = clean_string(cells[1].text.strip())
180                    if imports != '':
181                        imp_list = r.parse_dependencies(imports)
182                        
183        # Remove duplicates from the dependencies
184        for dep in dep_list:
185            if dep in imp_list:
186                imp_list.remove(dep)
187
188        # Return the data
189        return {
190            'name': name,
191            'version': version,
192            'dependencies': list(dep_list + imp_list),
193            'url': url
194        }
class BioconductorScraper(ScraperDataSource):
    '''
    Scraper data source for Bioconductor packages.

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler for the scraper, if None, it will be initialized with a generic RequestHandler

    Attributes
    ----------
    BIOCONDUCTOR_LIST_URL : str
        The URL of the page with the list of Bioconductor packages
    BIOCONDUCTOR_PACKAGE_DATA_URL : str
        The URL of the page with the data of each Bioconductor package
    '''

    def __init__(self, request_handler: RequestHandler = None):
        '''
        Constructor
        '''
        # URL of the JavaScript-rendered page that lists all software packages
        self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software'
        # Base URL of each package's detail page
        self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/'
        super().__init__(request_handler)

    @override
    def obtain_package_names(self) -> List[str]:
        '''
        Get the list of packages from the Bioconductor website

        Returns
        -------
        List[str]
            List of package names

        Raises
        ------
        OliviaFinderException
            If the list of packages cannot be obtained

        Example
        -------
        >>> scraper = BioconductorScraper()
        >>> package_names = scraper.obtain_package_names()
        '''
        # The package list is built client-side with JavaScript, so the page is
        # rendered in a headless Firefox driven by Selenium.
        try:
            self.logger.debug("Creating the Selenium driver...")
            opts = webdriver.FirefoxOptions()
            opts.headless = True
            browser = webdriver.Firefox(options=opts)
        except Exception as e:
            raise OliviaFinderException("Exception occurred while creating the Selenium driver") from e

        # Fetch the rendered page and grab the inner HTML of the package table
        try:
            self.logger.debug("Scraping the Bioconductor website...")
            browser.get(self.BIOCONDUCTOR_LIST_URL)
            markup = browser.find_element(By.ID, "biocViews_package_table").get_attribute("innerHTML")
        except Exception as e:
            raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from e

        browser.close()

        # Each package name is the text of the first link inside a table cell
        try:
            self.logger.debug("Processing the HTML...")
            parsed = BeautifulSoup(markup, 'html.parser')
            packages = []
            for row in parsed.find_all("tr"):
                for cell in row.find_all("td"):
                    link = cell.find("a")
                    if link:
                        packages.append(link.text)
        except Exception as e:
            raise OliviaFinderException("Exception occurred while processing the HTML.") from e

        packages.sort()
        self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")
        return packages

    @override
    def _build_url(self, package_name: str) -> str:
        '''
        Build the URL of the package page in the Bioconductor website

        Parameters
        ----------
        package_name : str
            The name of the package

        Returns
        -------
        str
            The URL of the package page in the Bioconductor website
        '''
        return self.BIOCONDUCTOR_PACKAGE_DATA_URL + package_name + '.html'

    @override
    def _parser(self, response: requests.Response) -> Dict[str, str]:
        '''
        Parse the response from the Bioconductor website
        It's obtained from the list of packages in the Bioconductor website

        Parameters
        ----------
        response : requests.Response
            The response from the Bioconductor website

        Returns
        -------
        Dict[str, str]
            The data of the package
        '''
        document = BeautifulSoup(response.text, 'html.parser')
        pkg_name = document.find('h1').text.strip()

        # Walk the label/value rows of the details table, keeping the fields of interest
        deps = []
        imps = []
        for row in document.find('table', class_='details').find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue
            field = cells[0].text
            if field == 'Version':
                version = clean_string(cells[1].text.strip())
            elif field == 'Depends':
                depends_text = clean_string(cells[1].text.strip())
                if depends_text != '':
                    deps = r.parse_dependencies(depends_text)
            elif field == 'Imports':
                imports_text = clean_string(cells[1].text.strip())
                if imports_text != '':
                    imps = r.parse_dependencies(imports_text)

        # Drop imports that already appear among the dependencies
        for dep in deps:
            if dep in imps:
                imps.remove(dep)

        return {
            'name': pkg_name,
            'version': version,
            'dependencies': list(deps + imps),
            'url': response.url
        }

Class to scrape data from Bioconductor packages

Parameters
  • request_handler (RequestHandler = None): Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
Attributes
  • BIOCONDUCTOR_LIST_URL (str): The URL of the page with the list of Bioconductor packages
  • BIOCONDUCTOR_PACKAGE_DATA_URL (str): The URL of the page with the data of each Bioconductor package
BioconductorScraper( request_handler: olivia_finder.myrequests.request_handler.RequestHandler = None)
35    def __init__(
36        self, 
37        request_handler: RequestHandler = None,
38    ):
39        '''
40        Constructor
41        '''
42
43        # Initialize the class variables
44        self.BIOCONDUCTOR_LIST_URL = 'https://www.bioconductor.org/packages/release/BiocViews.html#___Software'
45        self.BIOCONDUCTOR_PACKAGE_DATA_URL = 'https://www.bioconductor.org/packages/release/bioc/html/'
46
47        # Call the constructor of the parent class
48        super().__init__(request_handler)

Constructor

@override
def obtain_package_names(self) -> List[str]:
 52    @override
 53    def obtain_package_names(self) -> List[str]:
 54        '''
 55        Get the list of packages from the Bioconductor website
 56
 57        Returns
 58        -------
 59        List[str]
 60            List of package names
 61            
 62        Raises
 63        ------
 64        OliviaFinderException
 65            If the list of packages cannot be obtained
 66            
 67        Example
 68        -------
 69        >>> scraper = BioconductorScraper()
 70        >>> package_names = scraper.obtain_package_names()
 71        '''
 72
 73        # # Make HTTP request to the list of packages page
 74        # # Is necessary to use Selenium because the list of packages is loaded dynamically
 75        # # with JavaScript, we need to render the page to get the list of packages
 76
 77        # # Load the Selenium driver
 78        # driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
 79
 80
 81        # Create the driver
 82        try:
 83            self.logger.debug("Creating the Selenium driver...")
 84
 85            driver_options = webdriver.FirefoxOptions()
 86            driver_options.headless = True
 87            driver = webdriver.Firefox(
 88                options = driver_options
 89            )
 90        except Exception as e:
 91            raise OliviaFinderException("Exception occurred while creating the Selenium driver") from e
 92    
 93        # Scraping webpage with package list
 94        try:
 95            self.logger.debug("Scraping the Bioconductor website...")
 96            driver.get(self.BIOCONDUCTOR_LIST_URL)
 97            table = driver.find_element(By.ID, "biocViews_package_table")
 98            table_content = table.get_attribute("innerHTML")
 99        except Exception as e:
100            raise OliviaFinderException("Exception occurred while scraping the Bioconductor website") from e
101
102        # Close the driver
103        driver.close()
104
105        # Process the HTML to obtain packages
106        try:
107            self.logger.debug("Processing the HTML...")
108            soup = BeautifulSoup(table_content, 'html.parser')
109            packages = []
110            for row in soup.find_all("tr"):
111                packages.extend(
112                    cell.find("a").text
113                    for cell in row.find_all("td")
114                    if cell.find("a")
115                )
116        except Exception as e:
117            raise OliviaFinderException("Exception occurred while processing the HTML.") from e
118        
119        # Sort the list of packages
120        packages.sort()
121        self.logger.info(f"Obtained {len(packages)} packages from {self.BIOCONDUCTOR_LIST_URL}")
122        
123        return packages

Get the list of packages from the Bioconductor website

Returns
  • List[str]: List of package names
Raises
  • OliviaFinderException: If the list of packages cannot be obtained
Example
>>> scraper = BioconductorScraper()
>>> package_names = scraper.obtain_package_names()