olivia_finder.data_source.csv_ds

  1from __future__ import annotations
  2import os
  3from typing import List, Optional
  4import pandas as pd
  5import tqdm
  6from .data_source import DataSource
  7
  8class CSVDataSource(DataSource):
  9    """
 10    Class that implements the methods for loading a network from a CSV file.
 11    Implements the **DataSource** abstract class.
 12    
 13    """
 14    
 15    def __init__(
 16        self,
 17        file_path: str, 
 18        dependent_field: Optional[str] = None,
 19        dependency_field: Optional[str] = None,
 20        dependent_version_field: Optional[str] = None,
 21        dependency_version_field: Optional[str] = None,
 22        dependent_url_field: Optional[str] = None
 23    ):
 24        """
 25        Constructor of the class
 26
 27        Parameters
 28        ----------
 29        file_path : str
 30            The path to the CSV file
 31        dependent_field : str, optional
 32            The name of the field that contains the dependent packages, by default None
 33        dependency_field : str, optional
 34            The name of the field that contains the dependency packages, by default None
 35        dependent_version_field : str, optional
 36            The name of the field that contains the dependent packages versions, by default None
 37        dependency_version_field : str, optional
 38            The name of the field that contains the dependency packages versions, by default None
 39        dependent_url_field : str, optional
 40            The name of the field that contains the dependent packages urls, by default None
 41
 42        Raises
 43        ------
 44        ValueError
 45            If the file path is None, If the file is not a CSV file, If the dependent field is None,
 46        """
 47
 48        # Set the dataframe as None and the fields
 49        self.data: Optional[pd.DataFrame] = None
 50        self.dependent_field = dependent_field
 51        self.dependency_field = dependency_field
 52        self.dependent_version_field = dependent_version_field
 53        self.dependency_version_field = dependency_version_field
 54        self.dependent_url_field = dependent_url_field
 55        self.file_path = file_path
 56
 57        # Initialize the logger
 58        super().__init__()
 59
 60        # Load the data if the file path is setted
 61        if self.file_path is not None:
 62            self._load_data()
 63        else:
 64            self.logger.debug("File path is None. Data not loaded.")
 65            raise ValueError("File path cannot be None.")
 66
 67    def _load_data(self):
 68        """
 69        Loads the data from a CSV file like [name,version,url,dependency,dependency_version]
 70        The dependent_version_field and dependent_url_field parameters are optional
 71
 72        Parameters
 73        ----------
 74        file_path : str
 75            The path to the CSV file
 76            
 77        Raises
 78        ------
 79        FileNotFoundError: Exception
 80            If the file does not exist
 81        ValueError: Exception
 82            If the file path is None, If the file is not a CSV file, If the dependent field is None, 
 83            If the dependency field is None, If the dependent field and dependency field are the same
 84        """
 85
 86        # Check the file is valid
 87        if self.file_path is None:
 88            raise ValueError("File path cannot be None.")
 89        
 90        if not os.path.exists(self.file_path):
 91            raise FileNotFoundError(f"File {self.file_path} not found.")
 92        
 93        if not self.file_path.endswith(".csv"):
 94            raise ValueError(f"File {self.file_path} is not a CSV file.")
 95        
 96        # Check if the mandatory fields are setted and are valid
 97        if self.dependent_field is None:
 98            raise ValueError("Dependent field cannot be None.")
 99        
100        if self.dependency_field is None:
101            raise ValueError("Dependency field cannot be None.")
102        
103        if self.dependent_field == self.dependency_field:
104            raise ValueError("Dependent field and dependency field cannot be the same.")
105        
106        # Load the data
107        self.data = pd.read_csv(self.file_path)
108        
109        # Mandatory fields
110        if self.dependent_field not in self.data.columns:
111            raise ValueError(f"Field {self.dependent_field} not found on data.")
112        
113        if self.dependency_field not in self.data.columns:
114            raise ValueError(f"Field {self.dependency_field} not found on data.")
115        
116        # Optional fields
117        if self.dependent_version_field is not None and self.dependent_version_field not in self.data.columns:
118            raise ValueError(f"Field {self.dependent_version_field} not found on data.")
119        
120        if self.dependency_version_field is not None and self.dependency_version_field not in self.data.columns:
121            raise ValueError(f"Field {self.dependency_version_field} not found on data.")
122        
123        if self.dependent_url_field is not None and self.dependent_url_field not in self.data.columns:
124            raise ValueError(f"Field {self.dependent_url_field} not found on data.")
125          
126    def obtain_package_names(self) -> List[str]:
127        """
128        Obtains the list of packages from the data source, sorted alphabetically.
129
130        Returns
131        -------
132        List[str]
133            The list of package names in the data source    
134
135        Examples
136        --------
137        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
138        >>> data_source.obtain_package_names()
139        ["package1", "package2", "package3"]    
140        """
141
142        # Check if the data is loaded
143        if self.data is None:
144            raise ValueError("Data is not loaded.")
145        
146        # Return the list of packages
147        return sorted(self.data[self.dependent_field].unique())
148    
149    def obtain_package_data(self, package_name: str, override_previous: bool = True) -> dict:
150        """
151        Obtains the package from the dataframe
152        
153        Parameters
154        ----------
155        package_name : str
156            The name of the package
157        override_previous : bool
158            If True, it will override the previous data with the same name but different version
159        
160        Returns
161        -------
162        dict
163            The data of the package in the form of a dictionary
164
165        Examples
166        --------
167        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
168        >>> data_source.obtain_package_data("package1")
169        {
170            "name": "package1",
171            "version": "1.0.0",
172            "url": "
173            "dependencies": [
174                {
175                    "name": "package2",
176                    "version": "1.0.0"
177                },
178            ]
179        }
180        """
181
182        # Check if the data is loaded
183        if self.data is None:
184            raise ValueError("Data is not loaded.")
185
186        # Get the rows of the package
187        package_rows = self.data[self.data[self.dependent_field] == package_name]
188
189        # Remove the previous data with the same name but different version
190        if override_previous:
191            # Get the last row
192            last_version = package_rows[self.dependent_version_field].max()
193            package_rows = package_rows[package_rows[self.dependent_version_field] == last_version]
194
195        if package_rows.empty:
196            self.logger.debug(f"Package {package_name} not found in data.")
197            return None
198            #raise ValueError(f"Package {package_name} not found in data.")
199
200        # Get the dependencies
201        dependencies = []
202
203        # Get a list of rows
204        package_rows = package_rows.to_dict("records")
205
206        for row in package_rows:
207            # Get the dependency name and version
208            dependency_name = row[self.dependency_field]
209            dependency_version = row[self.dependency_version_field] if self.dependency_version_field is not None else None
210            
211            # Build the dependency dictionary
212            # Iggnore {'name': nan, 'version': nan}
213            if pd.isna(dependency_name):
214                continue
215
216            dependency = {
217                "name": dependency_name,
218                "version": dependency_version
219            }
220
221            # Add the dependency to the list
222            dependencies.append(dependency)
223
224        # Return the data
225        return {
226            "name": package_name,
227            "version": package_rows[0][self.dependent_version_field] if self.dependent_version_field is not None else None,
228            "url": package_rows[0][self.dependent_url_field] if self.dependent_url_field is not None else None,
229            "dependencies": dependencies
230        }
231    
232    def obtain_packages_data(
233        self,
234        package_names: List[str],
235        progress_bar: Optional[tqdm.tqdm] = None
236    ) -> tuple[List[dict], List[str]]:
237        '''
238        Obtains the data of a list of package names from the CSV file
239        If the package name list is None, it will obtain the package names from the CSV file and load their data
240
241        Parameters
242        ----------
243        package_names : List[str]
244            The list of package names to obtain the data from
245        progress_bar : tqdm.tqdm
246            The progress bar to update
247
248        Returns
249        -------
250        tuple[List[dict], List[str]]
251            The list of packages data and the list of not found packages
252
253        Examples
254        --------
255        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
256        >>> data_source.obtain_packages_data(["package1", "package2"])
257        (
258            [
259                {
260                    "name": "package1",
261                    "version": "1.0.0",
262                    "url": "
263                    "dependencies": [ ... ]
264                },
265                {
266                    "name": "package2",
267                    "version": "1.0.0",
268                    "url": "
269                    "dependencies": [ ... ]
270                }
271            ],
272            []
273        )
274        
275        '''
276        
277        # Define the list of packages and the list of not found packages
278        packages = []
279        not_found = []
280
281        # Iterate over the package names and obtain the data
282        for package_name in package_names:
283            try:
284                packages.append(self.obtain_package_data(package_name))
285
286            # If the package is not found, add it to the not found list, and continue
287            except ValueError:
288                self.logger.debug(f"Package {package_name} not found in data.")
289                not_found.append(package_name)
290                continue
291            
292            if progress_bar is not None:
293                progress_bar.update(1)
294                        
295        return packages, not_found
class CSVDataSource(olivia_finder.data_source.data_source.DataSource):
 10class CSVDataSource(DataSource):
 11    """
 12    Class that implements the methods for loading a network from a CSV file.
 13    Implements the **DataSource** abstract class.
 14    
 15    """
 16    
 17    def __init__(
 18        self,
 19        file_path: str, 
 20        dependent_field: Optional[str] = None,
 21        dependency_field: Optional[str] = None,
 22        dependent_version_field: Optional[str] = None,
 23        dependency_version_field: Optional[str] = None,
 24        dependent_url_field: Optional[str] = None
 25    ):
 26        """
 27        Constructor of the class
 28
 29        Parameters
 30        ----------
 31        file_path : str
 32            The path to the CSV file
 33        dependent_field : str, optional
 34            The name of the field that contains the dependent packages, by default None
 35        dependency_field : str, optional
 36            The name of the field that contains the dependency packages, by default None
 37        dependent_version_field : str, optional
 38            The name of the field that contains the dependent packages versions, by default None
 39        dependency_version_field : str, optional
 40            The name of the field that contains the dependency packages versions, by default None
 41        dependent_url_field : str, optional
 42            The name of the field that contains the dependent packages urls, by default None
 43
 44        Raises
 45        ------
 46        ValueError
 47            If the file path is None, If the file is not a CSV file, If the dependent field is None,
 48        """
 49
 50        # Set the dataframe as None and the fields
 51        self.data: Optional[pd.DataFrame] = None
 52        self.dependent_field = dependent_field
 53        self.dependency_field = dependency_field
 54        self.dependent_version_field = dependent_version_field
 55        self.dependency_version_field = dependency_version_field
 56        self.dependent_url_field = dependent_url_field
 57        self.file_path = file_path
 58
 59        # Initialize the logger
 60        super().__init__()
 61
 62        # Load the data if the file path is setted
 63        if self.file_path is not None:
 64            self._load_data()
 65        else:
 66            self.logger.debug("File path is None. Data not loaded.")
 67            raise ValueError("File path cannot be None.")
 68
 69    def _load_data(self):
 70        """
 71        Loads the data from a CSV file like [name,version,url,dependency,dependency_version]
 72        The dependent_version_field and dependent_url_field parameters are optional
 73
 74        Parameters
 75        ----------
 76        file_path : str
 77            The path to the CSV file
 78            
 79        Raises
 80        ------
 81        FileNotFoundError: Exception
 82            If the file does not exist
 83        ValueError: Exception
 84            If the file path is None, If the file is not a CSV file, If the dependent field is None, 
 85            If the dependency field is None, If the dependent field and dependency field are the same
 86        """
 87
 88        # Check the file is valid
 89        if self.file_path is None:
 90            raise ValueError("File path cannot be None.")
 91        
 92        if not os.path.exists(self.file_path):
 93            raise FileNotFoundError(f"File {self.file_path} not found.")
 94        
 95        if not self.file_path.endswith(".csv"):
 96            raise ValueError(f"File {self.file_path} is not a CSV file.")
 97        
 98        # Check if the mandatory fields are setted and are valid
 99        if self.dependent_field is None:
100            raise ValueError("Dependent field cannot be None.")
101        
102        if self.dependency_field is None:
103            raise ValueError("Dependency field cannot be None.")
104        
105        if self.dependent_field == self.dependency_field:
106            raise ValueError("Dependent field and dependency field cannot be the same.")
107        
108        # Load the data
109        self.data = pd.read_csv(self.file_path)
110        
111        # Mandatory fields
112        if self.dependent_field not in self.data.columns:
113            raise ValueError(f"Field {self.dependent_field} not found on data.")
114        
115        if self.dependency_field not in self.data.columns:
116            raise ValueError(f"Field {self.dependency_field} not found on data.")
117        
118        # Optional fields
119        if self.dependent_version_field is not None and self.dependent_version_field not in self.data.columns:
120            raise ValueError(f"Field {self.dependent_version_field} not found on data.")
121        
122        if self.dependency_version_field is not None and self.dependency_version_field not in self.data.columns:
123            raise ValueError(f"Field {self.dependency_version_field} not found on data.")
124        
125        if self.dependent_url_field is not None and self.dependent_url_field not in self.data.columns:
126            raise ValueError(f"Field {self.dependent_url_field} not found on data.")
127          
128    def obtain_package_names(self) -> List[str]:
129        """
130        Obtains the list of packages from the data source, sorted alphabetically.
131
132        Returns
133        -------
134        List[str]
135            The list of package names in the data source    
136
137        Examples
138        --------
139        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
140        >>> data_source.obtain_package_names()
141        ["package1", "package2", "package3"]    
142        """
143
144        # Check if the data is loaded
145        if self.data is None:
146            raise ValueError("Data is not loaded.")
147        
148        # Return the list of packages
149        return sorted(self.data[self.dependent_field].unique())
150    
151    def obtain_package_data(self, package_name: str, override_previous: bool = True) -> dict:
152        """
153        Obtains the package from the dataframe
154        
155        Parameters
156        ----------
157        package_name : str
158            The name of the package
159        override_previous : bool
160            If True, it will override the previous data with the same name but different version
161        
162        Returns
163        -------
164        dict
165            The data of the package in the form of a dictionary
166
167        Examples
168        --------
169        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
170        >>> data_source.obtain_package_data("package1")
171        {
172            "name": "package1",
173            "version": "1.0.0",
174            "url": "
175            "dependencies": [
176                {
177                    "name": "package2",
178                    "version": "1.0.0"
179                },
180            ]
181        }
182        """
183
184        # Check if the data is loaded
185        if self.data is None:
186            raise ValueError("Data is not loaded.")
187
188        # Get the rows of the package
189        package_rows = self.data[self.data[self.dependent_field] == package_name]
190
191        # Remove the previous data with the same name but different version
192        if override_previous:
193            # Get the last row
194            last_version = package_rows[self.dependent_version_field].max()
195            package_rows = package_rows[package_rows[self.dependent_version_field] == last_version]
196
197        if package_rows.empty:
198            self.logger.debug(f"Package {package_name} not found in data.")
199            return None
200            #raise ValueError(f"Package {package_name} not found in data.")
201
202        # Get the dependencies
203        dependencies = []
204
205        # Get a list of rows
206        package_rows = package_rows.to_dict("records")
207
208        for row in package_rows:
209            # Get the dependency name and version
210            dependency_name = row[self.dependency_field]
211            dependency_version = row[self.dependency_version_field] if self.dependency_version_field is not None else None
212            
213            # Build the dependency dictionary
214            # Iggnore {'name': nan, 'version': nan}
215            if pd.isna(dependency_name):
216                continue
217
218            dependency = {
219                "name": dependency_name,
220                "version": dependency_version
221            }
222
223            # Add the dependency to the list
224            dependencies.append(dependency)
225
226        # Return the data
227        return {
228            "name": package_name,
229            "version": package_rows[0][self.dependent_version_field] if self.dependent_version_field is not None else None,
230            "url": package_rows[0][self.dependent_url_field] if self.dependent_url_field is not None else None,
231            "dependencies": dependencies
232        }
233    
234    def obtain_packages_data(
235        self,
236        package_names: List[str],
237        progress_bar: Optional[tqdm.tqdm] = None
238    ) -> tuple[List[dict], List[str]]:
239        '''
240        Obtains the data of a list of package names from the CSV file
241        If the package name list is None, it will obtain the package names from the CSV file and load their data
242
243        Parameters
244        ----------
245        package_names : List[str]
246            The list of package names to obtain the data from
247        progress_bar : tqdm.tqdm
248            The progress bar to update
249
250        Returns
251        -------
252        tuple[List[dict], List[str]]
253            The list of packages data and the list of not found packages
254
255        Examples
256        --------
257        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
258        >>> data_source.obtain_packages_data(["package1", "package2"])
259        (
260            [
261                {
262                    "name": "package1",
263                    "version": "1.0.0",
264                    "url": "
265                    "dependencies": [ ... ]
266                },
267                {
268                    "name": "package2",
269                    "version": "1.0.0",
270                    "url": "
271                    "dependencies": [ ... ]
272                }
273            ],
274            []
275        )
276        
277        '''
278        
279        # Define the list of packages and the list of not found packages
280        packages = []
281        not_found = []
282
283        # Iterate over the package names and obtain the data
284        for package_name in package_names:
285            try:
286                packages.append(self.obtain_package_data(package_name))
287
288            # If the package is not found, add it to the not found list, and continue
289            except ValueError:
290                self.logger.debug(f"Package {package_name} not found in data.")
291                not_found.append(package_name)
292                continue
293            
294            if progress_bar is not None:
295                progress_bar.update(1)
296                        
297        return packages, not_found

Class that implements the methods for loading a network from a CSV file. Implements the DataSource abstract class.

CSVDataSource( file_path: str, dependent_field: Optional[str] = None, dependency_field: Optional[str] = None, dependent_version_field: Optional[str] = None, dependency_version_field: Optional[str] = None, dependent_url_field: Optional[str] = None)
17    def __init__(
18        self,
19        file_path: str, 
20        dependent_field: Optional[str] = None,
21        dependency_field: Optional[str] = None,
22        dependent_version_field: Optional[str] = None,
23        dependency_version_field: Optional[str] = None,
24        dependent_url_field: Optional[str] = None
25    ):
26        """
27        Constructor of the class
28
29        Parameters
30        ----------
31        file_path : str
32            The path to the CSV file
33        dependent_field : str, optional
34            The name of the field that contains the dependent packages, by default None
35        dependency_field : str, optional
36            The name of the field that contains the dependency packages, by default None
37        dependent_version_field : str, optional
38            The name of the field that contains the dependent packages versions, by default None
39        dependency_version_field : str, optional
40            The name of the field that contains the dependency packages versions, by default None
41        dependent_url_field : str, optional
42            The name of the field that contains the dependent packages urls, by default None
43
44        Raises
45        ------
46        ValueError
47            If the file path is None, If the file is not a CSV file, If the dependent field is None,
48        """
49
50        # Set the dataframe as None and the fields
51        self.data: Optional[pd.DataFrame] = None
52        self.dependent_field = dependent_field
53        self.dependency_field = dependency_field
54        self.dependent_version_field = dependent_version_field
55        self.dependency_version_field = dependency_version_field
56        self.dependent_url_field = dependent_url_field
57        self.file_path = file_path
58
59        # Initialize the logger
60        super().__init__()
61
62        # Load the data if the file path is setted
63        if self.file_path is not None:
64            self._load_data()
65        else:
66            self.logger.debug("File path is None. Data not loaded.")
67            raise ValueError("File path cannot be None.")

Constructor of the class

Parameters
  • file_path (str): The path to the CSV file
  • dependent_field (str, optional): The name of the field that contains the dependent packages, by default None
  • dependency_field (str, optional): The name of the field that contains the dependency packages, by default None
  • dependent_version_field (str, optional): The name of the field that contains the dependent packages versions, by default None
  • dependency_version_field (str, optional): The name of the field that contains the dependency packages versions, by default None
  • dependent_url_field (str, optional): The name of the field that contains the dependent packages urls, by default None
Raises
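  • ValueError: If the file path is None, the file is not a CSV file, the dependent or dependency field is None, both fields are the same, or a configured field is not a column of the file
  • FileNotFoundError: If the file does not exist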
def obtain_package_names(self) -> List[str]:
128    def obtain_package_names(self) -> List[str]:
129        """
130        Obtains the list of packages from the data source, sorted alphabetically.
131
132        Returns
133        -------
134        List[str]
135            The list of package names in the data source    
136
137        Examples
138        --------
139        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
140        >>> data_source.obtain_package_names()
141        ["package1", "package2", "package3"]    
142        """
143
144        # Check if the data is loaded
145        if self.data is None:
146            raise ValueError("Data is not loaded.")
147        
148        # Return the list of packages
149        return sorted(self.data[self.dependent_field].unique())

Obtains the list of packages from the data source, sorted alphabetically.

Returns
  • List[str]: The list of package names in the data source
Examples
>>> data_source = CSVDataSource("test.csv", "name", "dependency")
>>> data_source.obtain_package_names()
["package1", "package2", "package3"]
def obtain_package_data(self, package_name: str, override_previous: bool = True) -> dict:
151    def obtain_package_data(self, package_name: str, override_previous: bool = True) -> dict:
152        """
153        Obtains the package from the dataframe
154        
155        Parameters
156        ----------
157        package_name : str
158            The name of the package
159        override_previous : bool
160            If True, it will override the previous data with the same name but different version
161        
162        Returns
163        -------
164        dict
165            The data of the package in the form of a dictionary
166
167        Examples
168        --------
169        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
170        >>> data_source.obtain_package_data("package1")
171        {
172            "name": "package1",
173            "version": "1.0.0",
174            "url": "
175            "dependencies": [
176                {
177                    "name": "package2",
178                    "version": "1.0.0"
179                },
180            ]
181        }
182        """
183
184        # Check if the data is loaded
185        if self.data is None:
186            raise ValueError("Data is not loaded.")
187
188        # Get the rows of the package
189        package_rows = self.data[self.data[self.dependent_field] == package_name]
190
191        # Remove the previous data with the same name but different version
192        if override_previous:
193            # Get the last row
194            last_version = package_rows[self.dependent_version_field].max()
195            package_rows = package_rows[package_rows[self.dependent_version_field] == last_version]
196
197        if package_rows.empty:
198            self.logger.debug(f"Package {package_name} not found in data.")
199            return None
200            #raise ValueError(f"Package {package_name} not found in data.")
201
202        # Get the dependencies
203        dependencies = []
204
205        # Get a list of rows
206        package_rows = package_rows.to_dict("records")
207
208        for row in package_rows:
209            # Get the dependency name and version
210            dependency_name = row[self.dependency_field]
211            dependency_version = row[self.dependency_version_field] if self.dependency_version_field is not None else None
212            
213            # Build the dependency dictionary
214            # Iggnore {'name': nan, 'version': nan}
215            if pd.isna(dependency_name):
216                continue
217
218            dependency = {
219                "name": dependency_name,
220                "version": dependency_version
221            }
222
223            # Add the dependency to the list
224            dependencies.append(dependency)
225
226        # Return the data
227        return {
228            "name": package_name,
229            "version": package_rows[0][self.dependent_version_field] if self.dependent_version_field is not None else None,
230            "url": package_rows[0][self.dependent_url_field] if self.dependent_url_field is not None else None,
231            "dependencies": dependencies
232        }

Obtains the package from the dataframe

Parameters
  • package_name (str): The name of the package
  • override_previous (bool): If True, it will override the previous data with the same name but different version
Returns
  • dict: The data of the package in the form of a dictionary
Examples
>>> data_source = CSVDataSource("test.csv", "name", "dependency")
>>> data_source.obtain_package_data("package1")
{
    "name": "package1",
    "version": "1.0.0",
    "url": "https://example.org/package1",
    "dependencies": [
        {
            "name": "package2",
            "version": "1.0.0"
        },
    ]
}
def obtain_packages_data( self, package_names: List[str], progress_bar: Optional[tqdm.std.tqdm] = None) -> tuple[typing.List[dict], typing.List[str]]:
234    def obtain_packages_data(
235        self,
236        package_names: List[str],
237        progress_bar: Optional[tqdm.tqdm] = None
238    ) -> tuple[List[dict], List[str]]:
239        '''
240        Obtains the data of a list of package names from the CSV file
241        If the package name list is None, it will obtain the package names from the CSV file and load their data
242
243        Parameters
244        ----------
245        package_names : List[str]
246            The list of package names to obtain the data from
247        progress_bar : tqdm.tqdm
248            The progress bar to update
249
250        Returns
251        -------
252        tuple[List[dict], List[str]]
253            The list of packages data and the list of not found packages
254
255        Examples
256        --------
257        >>> data_source = CSVDataSource("test.csv", "name", "dependency")
258        >>> data_source.obtain_packages_data(["package1", "package2"])
259        (
260            [
261                {
262                    "name": "package1",
263                    "version": "1.0.0",
264                    "url": "
265                    "dependencies": [ ... ]
266                },
267                {
268                    "name": "package2",
269                    "version": "1.0.0",
270                    "url": "
271                    "dependencies": [ ... ]
272                }
273            ],
274            []
275        )
276        
277        '''
278        
279        # Define the list of packages and the list of not found packages
280        packages = []
281        not_found = []
282
283        # Iterate over the package names and obtain the data
284        for package_name in package_names:
285            try:
286                packages.append(self.obtain_package_data(package_name))
287
288            # If the package is not found, add it to the not found list, and continue
289            except ValueError:
290                self.logger.debug(f"Package {package_name} not found in data.")
291                not_found.append(package_name)
292                continue
293            
294            if progress_bar is not None:
295                progress_bar.update(1)
296                        
297        return packages, not_found

Obtains the data of a list of package names from the CSV file. If the package name list is None, it will obtain the package names from the CSV file and load their data.

Parameters
  • package_names (List[str]): The list of package names to obtain the data from
  • progress_bar (tqdm.tqdm): The progress bar to update
Returns
  • tuple[List[dict], List[str]]: The list of packages data and the list of not found packages
Examples
>>> data_source = CSVDataSource("test.csv", "name", "dependency")
>>> data_source.obtain_packages_data(["package1", "package2"])
(
    [
        {
            "name": "package1",
            "version": "1.0.0",
            "url": "https://example.org/package1",
            "dependencies": [ ... ]
        },
        {
            "name": "package2",
            "version": "1.0.0",
            "url": "https://example.org/package2",
            "dependencies": [ ... ]
        }
    ],
    []
)