def _QuarterCheckpoints(annotation_len: int) -> dict[int, int]:
    """
    Build the `annotation index -> progress percent` map for one page.

    Entries are listed in ascending percent order, so when several percentages
    land on the same index (pages with four or fewer annotations) the highest
    one is kept and the last annotation always maps to 100.
    """

    return {
        int(0.25 * annotation_len): 25,
        int(0.50 * annotation_len): 50,
        int(0.75 * annotation_len): 75,
        annotation_len - 1: 100,
    }


def _SaveQuarterCheckpoint(page_num: int, quarter: int) -> None:
    """
    Combine the files downloaded so far for `page_num` into a quarter file.

    For both unit types ("kg" and "m3") the CSV files of the page folder are
    combined, the combined file is moved into the unit's results folder, and
    quarter files of lower percentages for the same page are removed.

    Args:
        page_num: number of the page being downloaded.
        quarter: progress percentage reached (25, 50, 75 or 100).
    """

    unit_types = ("kg", "m3")

    results_folder_name: str = toxicity_config["results_folder_name"]
    results_file_name: str = toxicity_config["results_file_name"]

    quarter_file_stem: str = f"{results_file_name}_{quarter}_page_{page_num}"
    quarter_file_name: str = f"{quarter_file_stem}.csv"

    v_logger.info(f"Quarter: {quarter}%, combining files in page_{page_num} folder...")

    for unit_type in unit_types:
        CombineCSVInFolder(
            f"{results_folder_name}/{unit_type}/page_{page_num}",
            quarter_file_stem,
        )

    v_logger.success(f"Quarter: {quarter}%, combining files in page_{page_num} folder!")

    v_logger.info(
        f"Moving {quarter_file_name} to {results_folder_name}...",
        LogMode.VERBOSELY,
    )

    for unit_type in unit_types:
        MoveFileToFolder(
            quarter_file_name,
            f"{results_folder_name}/{unit_type}/page_{page_num}",
            f"{results_folder_name}/{unit_type}",
        )

    v_logger.success(
        f"Moving {quarter_file_name} to {results_folder_name}!",
        LogMode.VERBOSELY,
    )

    # Every lower quarter is removed, not only the previous one: on short pages
    # checkpoints can be skipped (e.g. 25 -> 100), and a leftover file would be
    # merged a second time when the results folder is combined.
    stale_quarters = [percent for percent in (25, 50, 75) if percent < quarter]

    if not stale_quarters:
        return

    v_logger.info("Deleting old quarter file...", LogMode.VERBOSELY)

    for stale_quarter in stale_quarters:
        stale_file_name = f"{results_file_name}_{stale_quarter}_page_{page_num}.csv"

        for unit_type in unit_types:
            stale_path = os.path.join(f"{results_folder_name}/{unit_type}", stale_file_name)

            if os.path.exists(stale_path):
                os.remove(stale_path)

    v_logger.success("Deleting old quarter file!", LogMode.VERBOSELY)


def DownloadPubChemCompoundsToxicity() -> None:
    """
    Download compound toxicity information from the PubChem database based on
    the configuration (`config.json`).

    Steps:
        1. For every page from `start_page` to `end_page`, fetch the
           "Acute Effects" annotations and hand each one to
           `DownloadCompoundToxicity`.
        2. When `need_combining` is set, combine the page's files at 25/50/75/100%
           progress, then combine everything into `<combined_file_name>_kg.csv`
           and `<combined_file_name>_m3.csv` in the results folder.
        3. When `delete_after_combining` is also set, delete the remaining
           intermediate items from the results folder.
        4. When `need_filtering_by_characteristics` is set, run
           `FilterDownloadedToxicityByCharacteristics` for "m3" and "kg".

    With `skip_downloaded` set, the download is skipped entirely if both combined
    files already exist, and individual pages are skipped as described inline.
    """

    results_folder_name: str = toxicity_config["results_folder_name"]
    results_folder_kg: str = f"{results_folder_name}/kg"
    results_folder_m3: str = f"{results_folder_name}/m3"

    # In testing mode only pages 1..3 are downloaded.
    if config["testing_flag"]:
        toxicity_config["start_page"] = 1
        toxicity_config["end_page"] = 3

    v_logger.UpdateFormat(toxicity_config["logger_label"], toxicity_config["logger_color"])

    v_logger.info(f"{'• ' * 10} PubChem downloading for DrugDesign.")

    # Download unless skipping is enabled and both combined files are present.
    if (
        not config["skip_downloaded"]
        or not IsFileInFolder(
            f"{toxicity_config['combined_file_name']}_m3.csv",
            results_folder_name,
        )
        or not IsFileInFolder(
            f"{toxicity_config['combined_file_name']}_kg.csv",
            results_folder_name,
        )
    ):
        for page_num in range(toxicity_config["start_page"], toxicity_config["end_page"] + 1):
            v_logger.info(f"Downloading page_{page_num}...")

            # Folder of this page; "{unit_type}" is left as a placeholder.
            page_folder_name: str = f"{results_folder_name}/{{unit_type}}/page_{page_num}"

            if config["skip_downloaded"]:
                # The folder of the NEXT page is what gets checked here.
                # NOTE(review): presumably because this page's own folder may
                # belong to an interrupted download, while the next page's folder
                # is only created after this page was processed - confirm.
                next_page_folder_name = f"{results_folder_name}/{{unit_type}}/page_{page_num + 1}"

                if any(
                    os.path.exists(next_page_folder_name.format(unit_type=unit_type))
                    for unit_type in ("kg", "m3")
                ):
                    v_logger.info(f"Folder for page_{page_num} is already exists, skip.")
                    continue

                # A 100% quarter file means the page was downloaded completely.
                full_quarter_path = (
                    f"{results_folder_name}/"
                    "{unit_type}/"
                    f"{toxicity_config['results_file_name']}_100_page_{page_num}.csv"
                )

                if any(
                    os.path.exists(full_quarter_path.format(unit_type=unit_type))
                    for unit_type in ("kg", "m3")
                ):
                    v_logger.info(f"100 quarter file for page_{page_num} is already exists, skip.")
                    continue

            os.makedirs(page_folder_name.format(unit_type="kg"), exist_ok=True)
            os.makedirs(page_folder_name.format(unit_type="m3"), exist_ok=True)

            compound_link: str = (
                "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/"
                "heading/JSON"
                "?heading=Acute+Effects"
                f"&page={page_num}"
            )

            data = GetResponse(compound_link, False, toxicity_config["sleep_time"]).json()[
                "Annotations"
            ]

            # Validate the page index before touching the annotation list.
            total_pages = int(data["TotalPages"])

            if page_num > total_pages:
                v_logger.LogException(
                    IndexError(
                        f"Invalid page index: '{page_num}'! "
                        f"Should be: 1 <= 'page' <= {total_pages}"
                    )
                )
                continue

            annotations = data["Annotation"]
            annotation_len = len(annotations)
            v_logger.info(f"Amount: {annotation_len}", LogMode.VERBOSELY)

            # Annotation indices at which the intermediate files are combined.
            quarters: dict[int, int] = _QuarterCheckpoints(annotation_len)

            for i, compound_data in enumerate(annotations):
                start_time = time.time()

                DownloadCompoundToxicity(compound_data, page_folder_name)

                end_time = time.time()

                if config["testing_flag"]:
                    v_logger.info(
                        f"Prev compound: {i}, time: {(end_time - start_time):.3f} sec.",
                        LogMode.VERBOSELY,
                    )

                if i in quarters and toxicity_config["need_combining"]:
                    _SaveQuarterCheckpoint(page_num, quarters[i])

        if toxicity_config["need_combining"]:
            # Merge the per-page quarter files of each unit type into one combined
            # file and move it to the root of the results folder.
            for unit_type, unit_results_folder in (
                ("kg", results_folder_kg),
                ("m3", results_folder_m3),
            ):
                CombineCSVInFolder(
                    unit_results_folder,
                    f"{toxicity_config['combined_file_name']}_{unit_type}",
                )

                MoveFileToFolder(
                    f"{toxicity_config['combined_file_name']}_{unit_type}.csv",
                    unit_results_folder,
                    results_folder_name,
                )

        if toxicity_config["delete_after_combining"] and toxicity_config["need_combining"]:
            v_logger.info(
                f"Deleting files after combining in '{results_folder_name}'...",
                LogMode.VERBOSELY,
            )

            # Items that are kept: the two combined files...
            except_items: list[str] = [
                f"{toxicity_config['combined_file_name']}_kg.csv",
                f"{toxicity_config['combined_file_name']}_m3.csv",
            ]

            molfiles_folder_name: str = toxicity_config["molfiles_folder_name"]

            # ...and, when the molfiles folder path contains the results folder
            # path, the first path component that follows it.
            if results_folder_name in molfiles_folder_name:
                except_items.append(
                    molfiles_folder_name.replace(results_folder_name, "").split("/")[1]
                )

            DeleteFilesInFolder(results_folder_name, except_items, delete_folders=True)

            v_logger.success(
                f"Deleting files after combining in '{results_folder_name}'!",
                LogMode.VERBOSELY,
            )

    else:
        v_logger.info(
            f"{toxicity_config['results_file_name']} is already downloaded, skip.",
            LogMode.VERBOSELY,
        )

    if filtering_config["need_filtering_by_characteristics"]:
        v_logger.info("·", LogMode.VERBOSELY)

        FilterDownloadedToxicityByCharacteristics(
            "m3", "organism", "route", "time_period", "testtype"
        )

        v_logger.info()

        FilterDownloadedToxicityByCharacteristics(
            "kg", "organism", "route", "time_period", "testtype"
        )

    v_logger.success(f"{'• ' * 10} PubChem downloading for DrugDesign!")
    v_logger.info()