def _CombineQuarterCheckpoint(toxicity_config: "Config",
                              page_num: int,
                              quarter: int) -> None:
    """
    Combine the CSV files downloaded so far for one page into a checkpoint
    file and replace the previous checkpoint of that page.

    The checkpoint is named `<results_file_name>_<quarter>_page_<page_num>`
    and is produced for both unit types ("kg" and "m3").

    Args:
        toxicity_config: the "PubChem_download_toxicity" config section.
        page_num: number of the page being downloaded.
        quarter: progress mark in percent taken from the quarters table
            (25, 50, 75 or 100).
    """

    results_folder: str = toxicity_config["results_folder_name"]
    results_file: str = toxicity_config["results_file_name"]

    checkpoint_name: str = f"{results_file}_{quarter}_page_{page_num}"
    quarter_file_name: str = f"{checkpoint_name}.csv"

    # (unit results folder, page folder inside it) for both unit types;
    # "kg" is always processed before "m3", as in the final combining step
    unit_folders: list[tuple[str, str]] = [
        (f"{results_folder}/{unit_type}",
         f"{results_folder}/{unit_type}/page_{page_num}")
        for unit_type in ("kg", "m3")]

    v_logger.info(f"Quarter: {quarter}%, combining files in "
                  f"page_{page_num} folder...")

    # presumably writes `<checkpoint_name>.csv` into the page folder: the
    # file is moved out of that folder right below
    for _, page_folder in unit_folders:
        CombineCSVInFolder(page_folder, checkpoint_name)

    v_logger.success(f"Quarter: {quarter}%, combining files in "
                     f"page_{page_num} folder!")

    v_logger.info(f"Moving {quarter_file_name} to {results_folder}...",
                  LogMode.VERBOSELY)

    for unit_folder, page_folder in unit_folders:
        MoveFileToFolder(quarter_file_name, page_folder, unit_folder)

    v_logger.success(f"Moving {quarter_file_name} to {results_folder}!",
                     LogMode.VERBOSELY)

    # the checkpoint written one quarter earlier is superseded by the new
    # one; a 25% checkpoint has no predecessor
    prev_quarter: int = quarter - 25

    if prev_quarter == 0:
        return

    old_quarter_file_name: str =\
        f"{results_file}_{prev_quarter}_page_{page_num}.csv"

    v_logger.info("Deleting old quarter file...", LogMode.VERBOSELY)

    for unit_folder, _ in unit_folders:
        old_quarter_path = os.path.join(unit_folder, old_quarter_file_name)

        if os.path.exists(old_quarter_path):
            os.remove(old_quarter_path)

    v_logger.success("Deleting old quarter file!", LogMode.VERBOSELY)


def _DownloadToxicityPage(toxicity_config: "Config", page_num: int) -> None:
    """
    Download the "Acute Effects" annotations of one PubChem page and store
    the toxicity data of every compound found on it.

    Args:
        toxicity_config: the "PubChem_download_toxicity" config section.
        page_num: number of the page to download.
    """

    results_folder: str = toxicity_config["results_folder_name"]

    v_logger.info(f"Downloading page_{page_num}...")

    # NOTE(review): the probe looks at the folder of the *next* page, not
    # of the current one - presumably so that a partially downloaded last
    # page is downloaded again; confirm this is intended, because the log
    # message below names the current page
    if config["skip_downloaded"] and\
        (os.path.exists(f"{results_folder}/kg/page_{page_num + 1}") or
         os.path.exists(f"{results_folder}/m3/page_{page_num + 1}")):
        v_logger.info(f"Folder for page_{page_num} is already exists, skip.")
        return

    os.makedirs(f"{results_folder}/kg/page_{page_num}", exist_ok=True)
    os.makedirs(f"{results_folder}/m3/page_{page_num}", exist_ok=True)

    compound_link: str =\
        "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/annotations/"\
        "heading/JSON"\
        "?heading=Acute+Effects"\
        f"&page={page_num}"

    data = GetResponse(compound_link,
                       False,
                       toxicity_config["sleep_time"]).json()["Annotations"]

    annotation_len = len(data["Annotation"])
    v_logger.info(f"Amount: {annotation_len}", LogMode.VERBOSELY)

    # index of the compound after which a checkpoint is made -> progress, %.
    # The insertion order matters: when indexes collide (pages with fewer
    # than 5 annotations) the later, lower percentage wins, which keeps the
    # "delete the checkpoint 25% below" chain contiguous
    quarters: dict[int, int] = {annotation_len - 1: 100,
                                int(0.75 * annotation_len): 75,
                                int(0.50 * annotation_len): 50,
                                int(0.25 * annotation_len): 25}

    total_pages = int(data["TotalPages"])

    if page_num > total_pages:
        v_logger.LogException(IndexError(
            f"Invalid page index: '{page_num}'! Should be: 1 < 'page' < "
            f"{total_pages}"))
        return

    # `{unit_type}` is deliberately left as a literal placeholder in the
    # path handed to DownloadCompoundToxicity
    compound_folder_template: str =\
        f"{results_folder}/{{unit_type}}/page_{page_num}"

    for i, compound_data in enumerate(data["Annotation"]):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments
        start_time = time.perf_counter()

        DownloadCompoundToxicity(compound_data, compound_folder_template)

        end_time = time.perf_counter()

        if config["testing_flag"]:
            v_logger.info(
                f"Prev compound: {i}, time: {(end_time - start_time):.3f}"
                f" sec.",
                LogMode.VERBOSELY)

        if i in quarters and toxicity_config["need_combining"]:
            _CombineQuarterCheckpoint(toxicity_config, page_num, quarters[i])


def _CombineAndCleanupToxicityResults(toxicity_config: "Config") -> None:
    """
    Combine the per-unit results into the final `<combined_file_name>_kg.csv`
    and `<combined_file_name>_m3.csv` files and, if configured, delete
    everything else from the results folder.

    Args:
        toxicity_config: the "PubChem_download_toxicity" config section.
    """

    results_folder: str = toxicity_config["results_folder_name"]
    combined_file: str = toxicity_config["combined_file_name"]

    if toxicity_config["need_combining"]:
        for unit_type in ("kg", "m3"):
            unit_folder = f"{results_folder}/{unit_type}"

            CombineCSVInFolder(unit_folder, f"{combined_file}_{unit_type}")

            MoveFileToFolder(f"{combined_file}_{unit_type}.csv",
                             unit_folder,
                             results_folder)

    if toxicity_config["delete_after_combining"] and \
            toxicity_config["need_combining"]:
        v_logger.info(
            f"Deleting files after combining in "
            f"'{results_folder}'...",
            LogMode.VERBOSELY)

        # the combined files must survive the cleanup
        except_items: list[str] = [f"{combined_file}_kg.csv",
                                   f"{combined_file}_m3.csv"]

        molfiles_folder_name: str = toxicity_config["molfiles_folder_name"]

        # if the molfiles folder is located inside the results folder, its
        # first path component below the results folder is kept as well
        if results_folder in molfiles_folder_name:
            except_items.append(molfiles_folder_name.replace(
                results_folder, "").split("/")[1])

        DeleteFilesInFolder(results_folder,
                            except_items,
                            delete_folders=True)

        v_logger.success(
            f"Deleting files after combining in "
            f"'{results_folder}'!",
            LogMode.VERBOSELY)


def DownloadPubChemCompoundsToxicity():
    """
    Download compound toxicity data from the PubChem database according to
    the configuration (`config.json`).

    Pages from `start_page` to `end_page` (inclusive) of the
    "PubChem_download_toxicity" config section are downloaded; when
    `testing_flag` is set, the range is overridden to pages 1..3. The
    download is skipped if `skip_downloaded` is set and both combined files
    are already present in the results folder. Afterwards the downloaded
    data is optionally filtered by characteristics.
    """

    toxicity_config: Config = config["PubChem_download_toxicity"]

    results_folder: str = toxicity_config["results_folder_name"]
    combined_file: str = toxicity_config["combined_file_name"]

    # in testing mode only the first pages are downloaded
    if config["testing_flag"]:
        toxicity_config["start_page"] = 1
        toxicity_config["end_page"] = 3

    v_logger.UpdateFormat(toxicity_config["logger_label"],
                          toxicity_config["logger_color"])

    v_logger.info(f"{'• ' * 10} PubChem downloading for DrugDesign.")

    # the folder is probed only when skipping is enabled; the m3 file is
    # checked before the kg one
    already_downloaded = config["skip_downloaded"] and\
        IsFileInFolder(f"{combined_file}_m3.csv", results_folder) and\
        IsFileInFolder(f"{combined_file}_kg.csv", results_folder)

    if already_downloaded:
        v_logger.info(
            f"{toxicity_config['results_file_name']} is already "
            f"downloaded, skip.",
            LogMode.VERBOSELY)

    else:
        for page_num in range(toxicity_config["start_page"],
                              toxicity_config["end_page"] + 1):
            _DownloadToxicityPage(toxicity_config, page_num)

        _CombineAndCleanupToxicityResults(toxicity_config)

    if toxicity_config["filtering"]["need_filtering_by_characteristics"]:
        v_logger.info("·", LogMode.VERBOSELY)

        FilterDownloadedToxicityByCharacteristics("m3",
                                                  "organism",
                                                  "time_period",
                                                  "testtype")

        v_logger.info()

        FilterDownloadedToxicityByCharacteristics("kg",
                                                  "organism",
                                                  "route",
                                                  "testtype")

    v_logger.success(f"{'• ' * 10} PubChem downloading for DrugDesign!")
    v_logger.info()