-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
307 lines (280 loc) · 14.1 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
import pandas as pd
from pandas import DataFrame
class convertor:
"""
A class to handle operations on a dataframe loaded from a CSV file,
including cleaning, data type conversion, and exporting.
"""
def __init__(self, filePath:str = "C:/file/path/here.csv") -> None:
"""
Initialize the class with the path to a CSV file and load it into a dataframe.
Args:
filePath (str, optional): The file path to the CSV file. Defaults to "C:/file/path/here".
"""
self.path = filePath
self.dataframe = DataFrame
try:
self.dataframe = pd.read_csv(filePath, encoding='latin-1')
except FileNotFoundError:
print("File Not Found | The provided filepath is invalid")
else:
print("File has been read successfully!")
def getDataframe(self) -> DataFrame:
"""
Get the dataframe loaded from the CSV file.
Returns:
DataFrame: The dataframe loaded from the CSV file.
"""
return self.dataframe
def getInfo(self) -> str:
"""
Print a concise summary of the dataframe.
Returns:
String: Summary of the dataframe
"""
return self.getDataframe().info()
def getColumnLength(self) -> int:
"""
Get the number of columns in the dataframe.
Returns:
int: The number of columns in the dataframe.
"""
return len(self.dataframe.columns)
def columnsCheck(self) -> int:
"""
Check for the presence of columns with NA/null values.
Returns:
int: The number of columns that contain NA/null values.
"""
na_counter = 0
for i in self.dataframe:
if self.dataframe[f'{i}'].isnull().sum() > 0:
na_counter += 1
else:
continue
return na_counter
def cleanDataset(self) -> str:
"""
Clean the dataset by removing rows with any NA/null values.
Returns:
str: A message indicating the result of the cleaning operation.
"""
try:
if self.columnsCheck() > 0:
self.dataframe = self.dataframe.dropna(how='any',axis=0)
else:
return True ,f"Dataset is clear of any na/null values"
except Exception as e:
return False, f"An error occurred while cleaning columns of the selected dataframe | {e}"
else:
return True ,f"Dataframe has been updated and cleared of any na/null values"
def convertDataType(self, attr: str, dataType: str) -> tuple:
"""
Convert the data type of a specified column in the dataframe.
Args:
attr (str): The name of the column to be converted.
dataType (str): The target data type for the column.
Possible values:
'string', 'int32', 'int64',
'bool', 'float64', 'float32',
'float', 'object'.
Raises:
UserWarning (1): If the column name is not found in the dataframe.
UserWarning (2): If the specified data type is not a valid option.
Returns:
tuple: A tuple containing a boolean indicating success or failure,
and a message with details.
"""
try:
if attr not in self.getDataframe():
raise UserWarning
except UserWarning:
return False, f"column not in dataframe"
else:
try:
datatypes_options = ['string', 'int32', 'int64',
'bool', 'float64', 'float32',
'float', 'object']
if dataType.lower() not in datatypes_options:
raise UserWarning
except UserWarning:
return False, f"An error occurred while processing datatype options"
else:
try:
self.dataframe = self.dataframe.astype({
f'{attr}': f'{dataType.lower()}'
})
except:
return False, f"An error occurred while converting the selected attribute"
else:
print(self.dataframe[f"{attr}"].dtype)
return True, f"Attribute is converted successfully as {self.dataframe[f'{attr}'].dtype}"
def autoConvertDatatype(self) -> tuple:
try:
validating_data = self.columnsCheck()
if validating_data > 0:
raise UserWarning
except UserWarning:
return False, f"Unable To Export Data | Your dataframe contains na/null values"
except Exception as e:
return False, f"An error occurred while validating data of the selected dataframe | {e}"
else:
try:
self.dataframe = self.dataframe.convert_dtypes(
infer_objects=True, convert_string=True, convert_integer=True,
convert_boolean=True, convert_floating=True)
except:
return False, f"An error occurred while converting datatypes on the selected dataframe"
else:
return True, f"Datatypes conversion is successful\nIt is advised to double check the dataframe"
def data_sql_modifier(self, table_name:str = 'tableXyz', output:int = 2) -> tuple:
"""
Generates a SQL CREATE TABLE statement based on the given DataFrame's schema.
Args:
table_name (str, optional): The name of the table to be created. Defaults to 'tableXyz'.
output (int, optional): Determines the output format. If 1, returns the fetched column info. If 2, prints and returns the SQL statement. Defaults to 2.
df (DataFrame): represents the dataframe stored/handled within the class/object.
Raises:
UserWarning: will be raised if there is a mismatch/failure/ and missing data OR certain requirements are not met.
Returns:
tuple: A tuple where the first element is a boolean indicating success or failure, and the second element is either:
- A list of tuples (column_name, datatype) if `output` is 1.
- A SQL CREATE TABLE statement if `output` is 2.
- An error message if there is a failure.
"""
df = self.dataframe
try:
head_statement = f"CREATE TABLE {table_name} ("
dataframe_info = []
for column in df:
column_name = str(column)
column_datatype = str(df[column_name].dtype)
column_name = column_name.replace(" ", "_")
tuple = (column_name.lower(), column_datatype.lower())
dataframe_info.append(tuple)
except Exception as e:
return False, f"An error occurred while reading datatypes on the selected dataframe | {e}"
else:
try:
fetched_info = []
sql_dType_options = [
('string', 'VARCHAR(255)'), ('int64', 'BIGINT'), ('float64', 'FLOAT'), ('DOUBLE', 'DOUBLE'),
('bool', 'BOOL'), ('object', 'VARCHAR(255)'), ('datetime64[ns]', 'TIMESTAMP'), ('BLOB', 'BLOB')
]
for content in dataframe_info:
cname = content[0]
ctype = content[1]
for option in sql_dType_options:
if ctype == option[0]:
fetched_tuple = (cname, option[1])
fetched_info.append(fetched_tuple)
else:
continue
except Exception as e:
return False, f"An error occurred while fetching datatypes of the selected dataframe | {e}"
else:
if output == 1:
return True, fetched_info
else:
try:
processed_columns = len(dataframe_info)
fetched_columns = len(fetched_info)
if processed_columns != fetched_columns:
raise UserWarning
else:
print(f'| Processed columns: {processed_columns}\n| Fetched columns: {fetched_columns}') # to verify fetching process is successful
except UserWarning:
return False, f"An error occurred while validating the fetched datatypes of the selected dataframe | Not all columns are fetched, some are missing" # edit the return message
except Exception as e:
return False, f"An error occurred while validating the fetched datatypes of the selected dataframe | {e}"
else:
try:
body_statement = ""
tuple_count = len(fetched_info)
for tuple in fetched_info:
if tuple_count > 1:
tuple_count -= 1
body_statement = f"{body_statement}\n{tuple[0]} {tuple[1]},"
else:
body_statement = f"{body_statement}\n{tuple[0]} {tuple[1]}\n);\n"
except Exception as e:
return False, f"An error occurred while generating the SQL statement of the selected dataframe | {e}"
else:
sql_statement = f"{head_statement}{body_statement}"
return True, sql_statement
def export_as_mysql(self, table_name:str = 'tableXyz', output:int = 2, path:str = "Output") -> tuple:
"""
Exports data to a MySQL formatted file.
Args:
table_name (str, optional): The name of the table to which data will be exported. Defaults to 'tableXyz'.
output (int, optional): Specifies the type of output modification to be applied. Defaults to 2.
Available Choices: 1, and 2 (If 1, returns the fetched column info. If 2, prints and returns the SQL statement. Defaults to 2.)
path (str, optional): The file path where the output will be saved. Defaults to "Output".
Note: Do not include a file formate as '.txt' to the output chosen path or name.
Example Of Path Formate: /C:/Folder/converter/assets/file_name (the function will export the output file in a .txt formate automatically)
Raises:
UserWarning(1): If the dataframe contains NaN or null values.
UserWarning(2): If data_sql_modifier fails to modify/return the data.
Returns:
tuple: A tuple containing a boolean status and a message.
- (True, "File has been created successfully!") on success.
- (False, "Error message") on failure, where "Error message" is a description of what went wrong OR default error message.
"""
try:
validating_data = self.columnsCheck()
insert_statement = f"INSERT INTO {table_name} VALUES \n"
if validating_data > 0:
raise UserWarning
except UserWarning:
return False, f"Unable To Export Data | Your dataframe contains na/null values"
except Exception as e:
return False, f"An error occurred while validating data of the selected dataframe | {e}"
else:
try:
state, imported_content = self.data_sql_modifier(table_name, output)
if state == False:
raise UserWarning
except UserWarning:
state, imported_content
else:
try:
file = open(path + ".txt", "w")
except Exception as e:
return False, f"An error occurred while creating output file | {e}"
else:
try:
imported_content = imported_content + "\n"
file.write(imported_content)
except Exception as e:
return False, f"An error occurred while writing 'imported_content' to output file | {e}"
else:
try:
file.write(insert_statement)
except Exception as e:
return False, f"An error occurred while writing 'insert_statement' to output file | {e}"
else:
try:
content = []
for i in range(len(self.getDataframe())):
record = tuple(self.getDataframe().iloc[i])
content.append(record)
except Exception as e:
return False, f"An error occurred while fetching data | {e}"
else:
try:
counter = len(content)
for record in content:
counter -= 1
if counter > 0:
file.write(str(record) + ',' + '\n')
else:
file.write(str(record) + ';')
except Exception as e:
return False, f"An error occurred while writing output file | {e}"
else:
try:
file.close()
except Exception as e:
return False, f"An error occurred while closing output file | {e}"
else:
return True, f"File has been created successfully!"