olivia_finder.data_source.repository_scrapers.github
from typing import List, Optional, Union

import requests
from bs4 import BeautifulSoup
from typing_extensions import override

from ..scraper_ds import ScraperDataSource
from ...myrequests.request_handler import RequestHandler
from ...myrequests.job import RequestJob
from ...utilities.exception import OliviaFinderException


class GithubScraper(ScraperDataSource):
    '''
    Scraper for the GitHub dependency graph pages.

    Obtains information about the dependencies of a repository by parsing
    https://github.com/<owner>/<repo>/network/dependencies.
    Implements the abstract class Scraper and accordingly DataSource class.
    '''

    def __init__(self, request_handler: Optional[RequestHandler] = None):
        '''
        Constructor

        Parameters
        ----------
        request_handler : RequestHandler = None
            Request handler for the scraper; if None, it will be initialized
            with a generic RequestHandler
        '''
        super().__init__(request_handler)

    def obtain_package_names(self) -> List[str]:
        '''
        Not supported for GitHub: repositories cannot be enumerated.

        Raises
        ------
        NotImplementedError
            Always.
        '''
        raise NotImplementedError("This method is not implemented")

    @override
    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
        '''
        Obtain the dependency data of a GitHub repository.

        Parameters
        ----------
        package_name : str
            Name of the package (repository) to scrape, e.g. "owner/repo"

        Returns
        -------
        dict
            Package data: name, version, url and list of dependencies

        Raises
        ------
        OliviaFinderException
            If the dependency graph page could not be fetched

        Examples
        --------
        >>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
        >>> scraper = GithubScraper()
        >>> scraper.obtain_package_data("dab0012/olivia_finder")
        {
            "name": "dab0012/olivia_finder",
            "version": "",
            "url": "https://github.com/dab0012/olivia_finder",
            "dependencies": [ ... ]
        }
        '''
        # Build the request job and fetch the first page
        url = self._build_url(package_name)
        job = self.request_handler.do_request(
            RequestJob("Repository packages", url)
        )

        if job.response is None:
            raise OliviaFinderException(f'Error obtaining the list of packages from {url}')

        # Parse the first page directly.  (The previous implementation
        # discarded this response and re-fetched the same URL inside the
        # pagination loop, doubling the first request.)
        soup = BeautifulSoup(job.response.text, 'html.parser')

        # Keyed by dependency name to de-duplicate across pages
        dependencies = {}

        # Walk every page of the dependency listing
        while True:

            # Each dependency is a list item with class "Box-row"
            for row in soup.find_all("li", {"class": "Box-row"}):

                # The first anchor's href is "/<owner>/<repo>"
                dep_name = row.find("a")['href'][1:]
                dep_version = row.find("span").text

                # Strip whitespace artifacts from the scraped text before
                # using the name anywhere (the URL now uses the cleaned name)
                dep_name = dep_name.replace(" ", "").replace("\n", "")
                dep_version = dep_version.replace(" ", "").replace("\n", "")

                dependencies[dep_name] = {
                    "name": dep_name,
                    "version": dep_version,
                    "url": f'https://github.com/{dep_name}'
                }

            # Follow the "next page" link if present, otherwise stop
            next_link = soup.find("a", {"class": "next_page"})
            if next_link is None:
                break
            url = f"https://github.com{next_link['href']}"
            response = requests.get(url=url, timeout=10)
            soup = BeautifulSoup(response.content, "html.parser")

        return {
            "name": package_name,
            "version": "",
            "url": f"https://github.com/{package_name}",
            "dependencies": list(dependencies.values())
        }

    def _parser(self, response):
        # TODO: Implement this method
        pass

    @override
    def _build_url(self, repository: str) -> str:
        '''
        Build the URL of the dependency graph page of a repository.
        Implements the abstract method of Scraper class

        Parameters
        ----------
        repository : str
            Name of the repository, e.g. "owner/repo"

        Returns
        -------
        str
            URL to scrape
        '''
        return f"https://github.com/{repository}/network/dependencies"
class GithubScraper(ScraperDataSource):
    '''
    Scraper for the GitHub dependency graph pages.

    Obtains information about the dependencies of a repository by parsing
    https://github.com/<owner>/<repo>/network/dependencies.
    Implements the abstract class Scraper and accordingly DataSource class.
    '''

    def __init__(self, request_handler: Optional[RequestHandler] = None):
        '''
        Constructor

        Parameters
        ----------
        request_handler : RequestHandler = None
            Request handler for the scraper; if None, it will be initialized
            with a generic RequestHandler
        '''
        super().__init__(request_handler)

    def obtain_package_names(self) -> List[str]:
        '''
        Not supported for GitHub: repositories cannot be enumerated.

        Raises
        ------
        NotImplementedError
            Always.
        '''
        raise NotImplementedError("This method is not implemented")

    @override
    def obtain_package_data(self, package_name: str) -> Union[dict, None]:
        '''
        Obtain the dependency data of a GitHub repository.

        Parameters
        ----------
        package_name : str
            Name of the package (repository) to scrape, e.g. "owner/repo"

        Returns
        -------
        dict
            Package data: name, version, url and list of dependencies

        Raises
        ------
        OliviaFinderException
            If the dependency graph page could not be fetched
        '''
        # Build the request job and fetch the first page
        url = self._build_url(package_name)
        job = self.request_handler.do_request(
            RequestJob("Repository packages", url)
        )

        if job.response is None:
            raise OliviaFinderException(f'Error obtaining the list of packages from {url}')

        # Parse the first page directly instead of discarding this response
        # and re-fetching the same URL inside the pagination loop
        soup = BeautifulSoup(job.response.text, 'html.parser')

        # Keyed by dependency name to de-duplicate across pages
        dependencies = {}

        # Walk every page of the dependency listing
        while True:

            # Each dependency is a list item with class "Box-row"
            for row in soup.find_all("li", {"class": "Box-row"}):

                # The first anchor's href is "/<owner>/<repo>"
                dep_name = row.find("a")['href'][1:]
                dep_version = row.find("span").text

                # Strip whitespace artifacts before using the name anywhere
                dep_name = dep_name.replace(" ", "").replace("\n", "")
                dep_version = dep_version.replace(" ", "").replace("\n", "")

                dependencies[dep_name] = {
                    "name": dep_name,
                    "version": dep_version,
                    "url": f'https://github.com/{dep_name}'
                }

            # Follow the "next page" link if present, otherwise stop
            next_link = soup.find("a", {"class": "next_page"})
            if next_link is None:
                break
            url = f"https://github.com{next_link['href']}"
            response = requests.get(url=url, timeout=10)
            soup = BeautifulSoup(response.content, "html.parser")

        return {
            "name": package_name,
            "version": "",
            "url": f"https://github.com/{package_name}",
            "dependencies": list(dependencies.values())
        }

    def _parser(self, response):
        # TODO: Implement this method
        pass

    @override
    def _build_url(self, repository: str) -> str:
        '''
        Build the URL of the dependency graph page of a repository.
        Implements the abstract method of Scraper class

        Parameters
        ----------
        repository : str
            Name of the repository, e.g. "owner/repo"

        Returns
        -------
        str
            URL to scrape
        '''
        return f"https://github.com/{repository}/network/dependencies"
Class that scrapes the Github website to obtain information about dependencies of a repository Implements the abstract class Scraper and accordingly DataSource class
GithubScraper( request_handler: Optional[olivia_finder.myrequests.request_handler.RequestHandler] = None)
def __init__(self, request_handler: Optional[RequestHandler] = None):
    '''
    Initialize the scraper.

    Parameters
    ----------
    request_handler : RequestHandler = None
        Request handler used for scraping; when None, the parent class
        falls back to a generic RequestHandler
    '''
    super().__init__(request_handler)
Constructor
Parameters
- request_handler (RequestHandler = None): Request handler for the scraper, if None, it will be initialized with a generic RequestHandler
def
obtain_package_names(self) -> List[str]:
def obtain_package_names(self) -> List[str]:
    '''
    Unsupported operation for the GitHub scraper.

    Raises
    ------
    NotImplementedError
        Always, since GitHub repositories cannot be enumerated.
    '''
    raise NotImplementedError("This method is not implemented")
Obtain the package names from the web page of the package manager. It must handle exceptions and return an empty list if the package names cannot be obtained. To be implemented by the child class.
Raises
- NotImplementedError: Because the method is not implemented in the base class
@override
def
obtain_package_data(self, package_name: str) -> Optional[dict]:
@override
def obtain_package_data(self, package_name: str) -> Union[dict, None]:
    '''
    Obtain the dependency data of a GitHub repository.

    Parameters
    ----------
    package_name : str
        Name of the package (repository) to scrape, e.g. "owner/repo"

    Returns
    -------
    dict
        Package data: name, version, url and list of dependencies

    Raises
    ------
    OliviaFinderException
        If the dependency graph page could not be fetched

    Examples
    --------
    >>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
    >>> scraper = GithubScraper()
    >>> scraper.obtain_package_data("dab0012/olivia_finder")
    {
        "name": "dab0012/olivia_finder",
        "version": "",
        "url": "https://github.com/dab0012/olivia_finder",
        "dependencies": [ ... ]
    }
    '''
    # Build the request job and fetch the first page
    url = self._build_url(package_name)
    job = self.request_handler.do_request(
        RequestJob("Repository packages", url)
    )

    if job.response is None:
        raise OliviaFinderException(f'Error obtaining the list of packages from {url}')

    # Parse the first page directly.  (The previous implementation discarded
    # this response and re-fetched the same URL inside the pagination loop.)
    soup = BeautifulSoup(job.response.text, 'html.parser')

    # Keyed by dependency name to de-duplicate across pages
    dependencies = {}

    # Walk every page of the dependency listing
    while True:

        # Each dependency is a list item with class "Box-row"
        for row in soup.find_all("li", {"class": "Box-row"}):

            # The first anchor's href is "/<owner>/<repo>"
            dep_name = row.find("a")['href'][1:]
            dep_version = row.find("span").text

            # Strip whitespace artifacts before using the name anywhere
            # (the URL now uses the cleaned name, not the raw href)
            dep_name = dep_name.replace(" ", "").replace("\n", "")
            dep_version = dep_version.replace(" ", "").replace("\n", "")

            dependencies[dep_name] = {
                "name": dep_name,
                "version": dep_version,
                "url": f'https://github.com/{dep_name}'
            }

        # Follow the "next page" link if present, otherwise stop
        next_link = soup.find("a", {"class": "next_page"})
        if next_link is None:
            break
        url = f"https://github.com{next_link['href']}"
        response = requests.get(url=url, timeout=10)
        soup = BeautifulSoup(response.content, "html.parser")

    return {
        "name": package_name,
        "version": "",
        "url": f"https://github.com/{package_name}",
        "dependencies": list(dependencies.values())
    }
Obtain the dependency data of a GitHub repository (package)
Parameters
- pkg_name (str): Name of the package (repository) to scrape
Returns
- dict: Package data containing the repository name, version, url and its list of dependencies
Examples
>>> from olivia_finder.data_source.repository_scrapers.github import GithubScraper
>>> scraper = GithubScraper()
>>> scraper.obtain_package_data("dab0012/olivia_finder")
{
    "name": "dab0012/olivia_finder",
    "version": "",
    "url": "https://github.com/dab0012/olivia_finder",
    "dependencies": [ ... ]
}