olivia_finder.data_source.repository_scrapers.github

  1from typing import List, Optional, Union
  2import requests
  3from typing_extensions import override
  4from bs4 import BeautifulSoup
  5from ..scraper_ds import ScraperDataSource
  6from ...myrequests.request_handler import RequestHandler
  7from ...myrequests.job import RequestJob
  8from ...utilities.exception import OliviaFinderException
  9
 10class GithubScraper(ScraperDataSource):
 11    ''' 
 12    Class that scrapes the Github website to obtain information about dependencies of a repository
 13    Implements the abstract class Scraper and accordingly DataSource class
 14
 15    '''
 16
 17    def __init__(self, request_handler: Optional[RequestHandler] = None):
 18        '''
 19        Constructor
 20
 21        Parameters
 22        ----------
 23        request_handler : RequestHandler = None
 24            Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
 25        '''
 26
 27        super().__init__(request_handler)
 28
 29    def obtain_package_names(self) -> List[str]:
 30        raise NotImplementedError("This method is not implemented")
 31
 32    @override
 33    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
 34        '''
 35        Obtain the list of packages names from the github repository 
 36
 37        Parameters
 38        ----------
 39        pkg_name : str
 40            Name of the package (repository) to scrape
 41
 42        Returns
 43        -------
 44        List[str]
 45            List of packages names
 46            
 47        Examples
 48        --------
 49        >>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
 50        >>> scraper = GithubScraper()
 51        >>> scraper.obtain_packages_data("
 52        ...     "dab0012/olivia_finder"
 53        ... )
 54        [
 55            {
 56                "name": "dab0012/olivia_finder",
 57                "version": "0.1.0",
 58                "url": "www.github.com/dab0012/olivia_finder"
 59                dependencies: [ ... ]
 60            }
 61        ]
 62
 63        '''
 64
 65        # Build the request job and do the request
 66        url = self._build_url(package_name)
 67        job = self.request_handler.do_request(
 68            RequestJob("Repository packages", url)
 69        )
 70
 71        if job.response is None:
 72            raise OliviaFinderException(f'Error obtaining the list of packages from {url}')
 73        
 74        # Get the list of packages
 75        soup = BeautifulSoup(job.response.text, 'html.parser')
 76
 77        next_page = True
 78        dependencies = {}
 79
 80        # Loop through all pages
 81        while next_page:
 82
 83            # do request and parse
 84            response = requests.get(url=url, timeout=10)
 85            soup = BeautifulSoup(response.content, "html.parser")
 86
 87            # loop through all dependencies
 88            for d in soup.findAll("li", {"class":"Box-row"}):
 89
 90                # Get data and store in the list
 91                # dep_name = d.find("a").text
 92                dep_name = d.find("a")['href'][1:]
 93                dep_version = d.find("span").text
 94                dep_url = f'https://github.com/{dep_name}'
 95
 96                # Clean up data
 97                dep_name = dep_name.replace(" ", "").replace("\n", "")
 98                dep_version = dep_version.replace(" ", "").replace("\n", "")
 99
100
101                dependencies[dep_name] = {
102                    "name": dep_name,
103                    "version": dep_version,
104                    "url": dep_url
105                }
106
107            # Check if next page exists and update url
108            next_page = soup.find("a", {"class":"next_page"}) != None
109            if next_page:
110                url = f"https://github.com{soup.find('a', {'class':'next_page'})['href']}"
111        
112        dep_list = []
113        for dep in dependencies:
114            dep_list.append(dependencies[dep])
115        
116        package = {
117            "name": package_name,
118            "version": "",
119            "url": f"https://github.com/{package_name}",
120            "dependencies": dep_list
121        }
122        
123        return package
124    
125    def _parser(self, response):
126        # TODO: Implement this method        
127        pass
128
129    @override
130    def _build_url(self, repository: str) -> str:
131        '''
132        Build the URL to scrape a package
133        Implements the abstract method of Scraper class
134        Parameters
135        ----------
136        pkg_name : str
137            Name of the package
138
139        Returns
140        -------
141        str
142            URL to scrape
143        '''
144
145        return f"https://github.com/{repository}/network/dependencies"
class GithubScraper(olivia_finder.data_source.scraper_ds.ScraperDataSource):
 11class GithubScraper(ScraperDataSource):
 12    ''' 
 13    Class that scrapes the Github website to obtain information about dependencies of a repository
 14    Implements the abstract class Scraper and accordingly DataSource class
 15
 16    '''
 17
 18    def __init__(self, request_handler: Optional[RequestHandler] = None):
 19        '''
 20        Constructor
 21
 22        Parameters
 23        ----------
 24        request_handler : RequestHandler = None
 25            Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
 26        '''
 27
 28        super().__init__(request_handler)
 29
 30    def obtain_package_names(self) -> List[str]:
 31        raise NotImplementedError("This method is not implemented")
 32
 33    @override
 34    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
 35        '''
 36        Obtain the list of packages names from the github repository 
 37
 38        Parameters
 39        ----------
 40        pkg_name : str
 41            Name of the package (repository) to scrape
 42
 43        Returns
 44        -------
 45        List[str]
 46            List of packages names
 47            
 48        Examples
 49        --------
 50        >>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
 51        >>> scraper = GithubScraper()
 52        >>> scraper.obtain_package_data(
 53        ...     "dab0012/olivia_finder"
 54        ... )
 55        [
 56            {
 57                "name": "dab0012/olivia_finder",
 58                "version": "0.1.0",
 59                "url": "www.github.com/dab0012/olivia_finder",
 60                "dependencies": [ ... ]
 61            }
 62        ]
 63
 64        '''
 65
 66        # Build the request job and do the request
 67        url = self._build_url(package_name)
 68        job = self.request_handler.do_request(
 69            RequestJob("Repository packages", url)
 70        )
 71
 72        if job.response is None:
 73            raise OliviaFinderException(f'Error obtaining the list of packages from {url}')
 74        
 75        # Get the list of packages
 76        soup = BeautifulSoup(job.response.text, 'html.parser')
 77
 78        next_page = True
 79        dependencies = {}
 80
 81        # Loop through all pages
 82        while next_page:
 83
 84            # do request and parse
 85            response = requests.get(url=url, timeout=10)
 86            soup = BeautifulSoup(response.content, "html.parser")
 87
 88            # loop through all dependencies
 89            for d in soup.findAll("li", {"class":"Box-row"}):
 90
 91                # Get data and store in the list
 92                # dep_name = d.find("a").text
 93                dep_name = d.find("a")['href'][1:]
 94                dep_version = d.find("span").text
 95                dep_url = f'https://github.com/{dep_name}'
 96
 97                # Clean up data
 98                dep_name = dep_name.replace(" ", "").replace("\n", "")
 99                dep_version = dep_version.replace(" ", "").replace("\n", "")
100
101
102                dependencies[dep_name] = {
103                    "name": dep_name,
104                    "version": dep_version,
105                    "url": dep_url
106                }
107
108            # Check if next page exists and update url
109            next_page = soup.find("a", {"class":"next_page"}) != None
110            if next_page:
111                url = f"https://github.com{soup.find('a', {'class':'next_page'})['href']}"
112        
113        dep_list = []
114        for dep in dependencies:
115            dep_list.append(dependencies[dep])
116        
117        package = {
118            "name": package_name,
119            "version": "",
120            "url": f"https://github.com/{package_name}",
121            "dependencies": dep_list
122        }
123        
124        return package
125    
126    def _parser(self, response):
127        # TODO: Implement this method        
128        pass
129
130    @override
131    def _build_url(self, repository: str) -> str:
132        '''
133        Build the URL to scrape a package
134        Implements the abstract method of Scraper class
135        Parameters
136        ----------
137        pkg_name : str
138            Name of the package
139
140        Returns
141        -------
142        str
143            URL to scrape
144        '''
145
146        return f"https://github.com/{repository}/network/dependencies"

Class that scrapes the Github website to obtain information about dependencies of a repository Implements the abstract class Scraper and accordingly DataSource class

GithubScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
18    def __init__(self, request_handler: Optional[RequestHandler] = None):
19        '''
20        Constructor
21
22        Parameters
23        ----------
24        request_handler : RequestHandler = None
25            Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
26        '''
27
28        super().__init__(request_handler)

Constructor

Parameters
  • request_handler (RequestHandler = None): Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
def obtain_package_names(self) -> List[str]:
30    def obtain_package_names(self) -> List[str]:
31        raise NotImplementedError("This method is not implemented")

Obtain the package names from the web page of the package manager it must handle exceptions and return an empty list if the package names cannot be obtained To be implemented by the child class

Raises
  • NotImplementedError: Because the method is not implemented in the base class
@override
def obtain_package_data(self, package_name: str) -> Optional[dict]:
 33    @override
 34    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
 35        '''
 36        Obtain the list of packages names from the github repository 
 37
 38        Parameters
 39        ----------
 40        pkg_name : str
 41            Name of the package (repository) to scrape
 42
 43        Returns
 44        -------
 45        List[str]
 46            List of packages names
 47            
 48        Examples
 49        --------
 50        >>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
 51        >>> scraper = GithubScraper()
 52        >>> scraper.obtain_package_data(
 53        ...     "dab0012/olivia_finder"
 54        ... )
 55        [
 56            {
 57                "name": "dab0012/olivia_finder",
 58                "version": "0.1.0",
 59                "url": "www.github.com/dab0012/olivia_finder",
 60                "dependencies": [ ... ]
 61            }
 62        ]
 63
 64        '''
 65
 66        # Build the request job and do the request
 67        url = self._build_url(package_name)
 68        job = self.request_handler.do_request(
 69            RequestJob("Repository packages", url)
 70        )
 71
 72        if job.response is None:
 73            raise OliviaFinderException(f'Error obtaining the list of packages from {url}')
 74        
 75        # Get the list of packages
 76        soup = BeautifulSoup(job.response.text, 'html.parser')
 77
 78        next_page = True
 79        dependencies = {}
 80
 81        # Loop through all pages
 82        while next_page:
 83
 84            # do request and parse
 85            response = requests.get(url=url, timeout=10)
 86            soup = BeautifulSoup(response.content, "html.parser")
 87
 88            # loop through all dependencies
 89            for d in soup.findAll("li", {"class":"Box-row"}):
 90
 91                # Get data and store in the list
 92                # dep_name = d.find("a").text
 93                dep_name = d.find("a")['href'][1:]
 94                dep_version = d.find("span").text
 95                dep_url = f'https://github.com/{dep_name}'
 96
 97                # Clean up data
 98                dep_name = dep_name.replace(" ", "").replace("\n", "")
 99                dep_version = dep_version.replace(" ", "").replace("\n", "")
100
101
102                dependencies[dep_name] = {
103                    "name": dep_name,
104                    "version": dep_version,
105                    "url": dep_url
106                }
107
108            # Check if next page exists and update url
109            next_page = soup.find("a", {"class":"next_page"}) != None
110            if next_page:
111                url = f"https://github.com{soup.find('a', {'class':'next_page'})['href']}"
112        
113        dep_list = []
114        for dep in dependencies:
115            dep_list.append(dependencies[dep])
116        
117        package = {
118            "name": package_name,
119            "version": "",
120            "url": f"https://github.com/{package_name}",
121            "dependencies": dep_list
122        }
123        
124        return package

Obtain the list of packages names from the github repository

Parameters
  • pkg_name (str): Name of the package (repository) to scrape
Returns
  • dict: Package data with keys "name", "version", "url" and "dependencies"
Examples
>>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
>>> scraper = GithubScraper()
>>> scraper.obtain_package_data(
...     "dab0012/olivia_finder"
... )
[
    {
        "name": "dab0012/olivia_finder",
        "version": "0.1.0",
        "url": "www.github.com/dab0012/olivia_finder",
        "dependencies": [ ... ]
    }
]