olivia_finder.myrequests.useragent_handler
Remove the @singleton decorator before documenting the class: documentation generation fails with the @singleton decorator.
"""
.. danger::

    Remove the @singleton decorator before documenting the class.
    Documentation generation fails with the @singleton decorator.

"""

import os
from threading import Lock
import random
from typing import List
import requests
from bs4 import BeautifulSoup
from ..utilities.logger import MyLogger
from ..utilities.singleton_decorator import singleton
from ..utilities.config import Configuration


@singleton
class UserAgentHandler():
    '''
    UserAgentHandler class

    This class is a singleton class that handles the user agents used by the requests
    to the different data sources. It can load the user agents from a file or from the
    useragentstring.com web page.

    If the user agents are loaded from a file, the file must be located in the ``data``
    folder next to this module and must be named ``useragents.txt``. The file must
    contain one user agent per line; blank lines are ignored.

    If the user agents are loaded from useragentstring.com, they are fetched from the
    URL specified in the class variable USERAGENTSTRING_URL
    (https://www.useragentstring.com/pages/useragentstring.php?name=All) and extracted
    from the ``<li>`` elements inside the element with id ``liste``.

    If neither source yields at least one user agent, a small hard-coded list of
    default user agents is used instead.

    Attributes
    ----------
    USERAGENTSTRING_URL : str
        The URL of the page with the list of user agents
    DATA_FILE : str
        The path to the file with the user agents
    useragents_list : List[str]
        The list of user agents loaded from the file or from useragentstring.com,
        default is an empty list that will be filled with the default user agents if the
        user agents cannot be loaded from the external sources
    '''

    USERAGENTSTRING_URL = 'https://www.useragentstring.com/pages/useragentstring.php?name=All'
    DATA_FILE: str
    useragents_list: List[str]

    def __init__(self, use_file: bool = True) -> None:
        '''
        Constructor

        Sources are tried in order: the data file (only if ``use_file`` is True),
        then the useragentstring.com page, then the hard-coded defaults.

        Parameters
        ----------
        use_file : bool = True
            If True, the user agents are loaded from the file specified in DATA_FILE,
            falling back to useragentstring.com if the file cannot be used.
            If False, the file is skipped and useragentstring.com is tried directly.

        '''

        # Logger shared by the myrequests package
        self.logger = MyLogger.get_logger("logger_myrequests")

        # Lock intended to guard concurrent access to the user agent list
        self.lock = Lock()

        # Initialize the list before loading the user agents
        self.useragents_list = []

        # Always resolve the data file path so DATA_FILE exists even when use_file is False
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.DATA_FILE = os.path.join(module_dir, 'data', 'useragents.txt')

        # Load user agents from file
        if use_file and self._load_from_file(self.DATA_FILE):
            self.logger.info(f"Useragents loaded from file: {self.DATA_FILE}")
            return

        # Load user agents from the useragentstring.com page
        if self._load_from_useragentstring():
            self.logger.info(f"Useragents loaded from USERAGENTSTRING_URL: {self.USERAGENTSTRING_URL}")
            return

        # No external source produced any user agent: fall back to the hard-coded defaults
        self.useragents_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
            'Mozilla/5.0 (iPhone12,1; U; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1',
            'Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
        ]
        self.logger.info("Useragents list is empty. Using default useragents")

    def _load_from_file(self, file_path: str) -> bool:
        '''
        Load user agents from a file, one user agent per line, and save them in the user agent list

        Surrounding whitespace is stripped from every line and blank lines are skipped,
        so a trailing newline never produces an empty user agent. The current list is
        only replaced when at least one user agent was read.

        Parameters
        ----------
        file_path : str
            Path to the file containing the user agents

        Returns
        -------
        bool
            True if at least one user agent was loaded, False otherwise

        Examples
        --------
        >>> useragent_handler = UserAgentHandler()
        >>> useragent_handler._load_from_file('data/useragents.txt')
        True
        '''

        if not file_path:
            return False

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                stripped_lines = (line.strip() for line in f)
                loaded = [ua for ua in stripped_lines if ua]

        except FileNotFoundError:
            self.logger.warning(f"Useragents file not found: {file_path}")
            return False

        except (OSError, UnicodeError) as e:
            # Unreadable file (permissions, directory, bad encoding): let the caller fall back
            self.logger.warning(f"Useragents file could not be read: {file_path}")
            self.logger.warning(f"Error: {e}")
            return False

        if not loaded:
            self.logger.warning(f"Useragents file is empty: {file_path}")
            return False

        self.useragents_list = loaded
        return True

    def _load_from_useragentstring(self) -> bool:
        '''
        Get user agents from the useragentstring.com page and append them to the user agent list

        The page at USERAGENTSTRING_URL is downloaded and the text of every ``<li>``
        inside the element with id ``liste`` is taken as a user agent.

        Returns
        -------
        bool
            True if at least one user agent was obtained, False otherwise
        '''

        self.logger.debug(f"Getting user agents from API: {self.USERAGENTSTRING_URL}")

        # Download the page; any failure here is non-fatal, the caller falls back to defaults
        try:
            response = requests.get(self.USERAGENTSTRING_URL, timeout=60)
            # Treat HTTP error statuses as a failed download instead of parsing an error page
            response.raise_for_status()
            page_html = response.text
        except Exception as e:
            self.logger.debug(f"Error getting user agents from API: {self.USERAGENTSTRING_URL}")
            self.logger.debug(f"Error: {e}")
            return False

        # Parse the HTML (deliberately best-effort: malformed markup must not crash the loader)
        try:
            soup = BeautifulSoup(page_html, 'html.parser')
            container = soup.find(id="liste")  # Element that wraps the list of user agents
            items = container.find_all("li")  # type: ignore # All li elements within the container

            # Clean the blanks from each entry and drop empty ones
            stripped_items = (li.text.strip() for li in items)
            found = [ua for ua in stripped_items if ua]

        except Exception as e:
            self.logger.warning(f"Error parsing user agents from API: {self.USERAGENTSTRING_URL}")
            self.logger.warning(f"Error: {e}")
            return False

        if not found:
            # An empty result is not a success: report it so the default list can be used
            self.logger.warning(f"No user agents found at: {self.USERAGENTSTRING_URL}")
            return False

        self.useragents_list.extend(found)
        return True

    def get_next_useragent(self) -> str:
        '''
        Returns a random useragent from the list, if the list is empty, returns a default useragent

        Returns
        -------
        str
            A random useragent
        '''

        # If the list is empty, return a default useragent (THIS SHOULD NOT HAPPEN)
        if not self.useragents_list:
            self.logger.warning("Useragents list is empty")
            return "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

        useragent = random.choice(self.useragents_list)
        self.logger.debug(f"Next useragent: {useragent}")
        return useragent