olivia_finder.myrequests.useragent_handler

Remove the @singleton decorator before documenting the class: documentation generation fails while the @singleton decorator is applied.

  1"""
  2
  3.. danger:: 
  4
  5    Remove the @singleton decorator before documenting the class
  6    Doc fails with the @singleton decorator
  7
  8"""
  9
 10import os
 11from threading import Lock
 12import random
 13from typing import List
 14import requests
 15from bs4 import BeautifulSoup
 16from ..utilities.logger import MyLogger
 17from ..utilities.singleton_decorator import singleton
 18from ..utilities.config import Configuration
 19
 20
 21@singleton
 22class UserAgentHandler():
 23    '''
 24    UserAgentHandler class
 25
 26    This class is a singleton class that handles the user agents used by the requests
 27    to the different data sources. It can load the user agents from a file or from the
 28    useragentstring.com API. 
 29    
 30    If the user agents are loaded from a file, the file must be located in the data
 31    folder of the package and must be named useragents.txt. The file must contain one
 32    user agent per line.
 33
 34    If the user agents are loaded from the useragentstring.com API, the user agents
 35    are loaded from the URL specified in the class variable USERAGENTSTRING_URL. The
 36    user agents are loaded from the page https://www.useragentstring.com/pages/useragentstring.php?name=All
 37    and the user agents are extracted from the table in the page. 
 38
 39    
 40    Attributes
 41    ----------
 42    USERAGENTSTRING_URL : str
 43        The URL of the page with the list of user agents
 44    DATA_FILE : str
 45        The path to the file with the user agents
 46    useragents_list : List[str]
 47        The list of user agents loaded from the file or from the useragentstring.com API,
 48        default is an empty list that will be filled with the default user agents if the
 49        user agents cannot be loaded from the external sources
 50    '''
 51
 52   
 53    
 54    USERAGENTSTRING_URL = 'https://www.useragentstring.com/pages/useragentstring.php?name=All'
 55    DATA_FILE: str
 56    useragents_list: List[str]
 57
 58    def __init__(self, use_file: bool = True) -> None:
 59        '''
 60        Constructor
 61
 62        Parameters
 63        ----------
 64        use_file : bool = True
 65            If True, the user agents are loaded from the file specified in the class variable DATA_FILE.
 66            If False, the user agents are loaded from the useragentstring.com API.
 67
 68        '''
 69
 70        # Get logger name from config file
 71        self.logger = MyLogger.get_logger("logger_myrequests")
 72
 73        # Lock to prevent concurrent access to the proxy list
 74        self.lock = Lock()
 75
 76        # Initialize the list before loading the user agents
 77        self.useragents_list = []
 78
 79        # Load user agents from file
 80        if use_file:
 81            # get the data file path
 82            current_file_path =  os.path.abspath(__file__)
 83            self.DATA_FILE = os.path.join(os.path.dirname(current_file_path), 'data', 'useragents.txt')
 84
 85            if self._load_from_file(self.DATA_FILE):
 86                self.logger.info(f"Useragents loaded from file: {self.DATA_FILE}")
 87                return
 88
 89        # Load user agents from the useragentstring.com API
 90        if self._load_from_useragentstring():
 91            self.logger.info(f"Useragents loaded from USERAGENTSTRING_URL: {self.USERAGENTSTRING_URL}")
 92            return
 93
 94        # If at this time there are no uses available, the default useragents are loaded using the list
 95        # hardcodeted below
 96        self.useragents_list = [
 97            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36',
 98            'Mozilla/5.0 (iPhone12,1; U; CPU iPhone OS 13_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/15E148 Safari/602.1',
 99            'Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36',
100            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0',
101            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
102            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15'
103        ]
104        self.logger.info("Useragents list is empty. Using default useragents")
105
106    def _load_from_file(self, file_path:str) -> bool:
107        '''
108        Load user agents from a file, one user agent per line, and save them in the user agent list
109
110        Parameters
111        ----------
112        file_path : str
113            Path to the file containing the user agents
114
115        Returns
116        -------
117        bool
118            True if the user agents were loaded correctly, False otherwise
119
120        Examples
121        --------
122        >>> useragent_handler = UserAgentHandler()
123        >>> useragent_handler._load_from_file('data/useragents.txt')
124            True
125        '''
126
127        if file_path is None:
128            return False
129
130        try:
131            with open(file_path, 'r') as f:
132                self.useragents_list = f.read().split('\n')
133                return True
134            
135        except FileNotFoundError:
136            self.logger.warning(f"Useragents file not found: {file_path}")
137            return False
138    
139    def _load_from_useragentstring(self) -> bool:
140        
141        '''
142        Get user agents from the useragentstring.com API and save them in the user agent list
143
144        Parameters
145        ----------
146        max_count : int, optional
147            Maximum number of user agents to be obtained, by default 30
148
149        Returns
150        -------
151        bool
152            True if the user agents were obtained correctly, False otherwise
153        '''
154
155        self.logger.debug(f"Getting user agents from API: {self.USERAGENTSTRING_URL}")
156
157        # Get user agents from the API
158        try:
159            user_agents_request = requests.get(self.USERAGENTSTRING_URL, timeout=60).text
160        except Exception as e:
161            self.logger.debug(f"Error getting user agents from API: {self.USERAGENTSTRING_URL}")
162            self.logger.debug(f"Error: {e}")
163            return False
164        
165        # Parse the HTML
166        try:
167            soup = BeautifulSoup(user_agents_request, 'html.parser')
168            div = soup.find(id="liste")     # Find the div element with id = liste
169            lis = div.find_all("li")        # type: ignore # Search for all li elements within the div element
170
171            # Stores user agents in a list
172            for li in lis:
173                # Add the user agent to the list (cleaning the blanks from the string)     
174                ua = li.text.strip()
175                self.useragents_list.append(ua)
176            return True
177        
178        except Exception as e:
179            self.logger.warning(f"Error parsing user agents from API: {self.USERAGENTSTRING_URL}")
180            self.logger.warning(f"Error: {e}")
181            return False
182
183    def get_next_useragent(self) -> str:
184        '''
185        Returns a random useragent from the list, if the list is empty, returns a default useragent
186
187        Returns
188        -------
189
190        str
191            A random useragent
192        '''
193
194        # If the list is empty, return a default useragent (THIS SHOULD NOT HAPPEN)
195        if len(self.useragents_list) == 0:
196            self.logger.warning("Useragents list is empty")
197            return "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
198
199        index = random.randint(0, len(self.useragents_list) - 1)
200        self.logger.debug(f"Next useragent: {self.useragents_list[index]}")
201        return self.useragents_list[index]