85) -> None:
86 """
87 Фильтрует данные о токсичности из CSV-файла по заданным характеристикам,
88 загружает molfile для каждого соединения и сохраняет результаты в CSV и SDF файлы.
89
90 Args:
91 unit_type (str): тип единиц измерения (например, "kg" или "m3").
92 charact_1 (str): название первой характеристики для фильтрации.
93 charact_2 (str): название второй характеристики для фильтрации.
94 charact_3 (str): название третьей характеристики для фильтрации.
95 charact_4 (str | None): название четвёртой (опциональной) характеристики
96 для фильтрации. Defaults to None.
97 """
98
99 v_logger.info(f"Filtering by characteristics for {unit_type}...")
100
101
102 charact_folder_name: str = (
103 f"{toxicity_config['results_folder_name']}/"
104 f"{filtering_config['characteristics_subfolder_name']}"
105 )
106 os.makedirs(charact_folder_name, exist_ok=True)
107
108 unit_type_df: pd.DataFrame
109
110 try:
111
112
113 unit_type_df = pd.read_csv(
114 f"{toxicity_config['results_folder_name']}/"
115 f"{toxicity_config['combined_file_name']}_{unit_type}.csv",
116 sep=config["csv_separator"],
117 low_memory=False,
118 )
119
120 except pd.errors.EmptyDataError:
121 v_logger.warning(
122 f"{unit_type} .csv file is empty, skip filtering by characteristics."
123 )
124 return
125
126
127
128 for charact in [charact_1, charact_2, charact_3, charact_4]:
129 if charact not in unit_type_df.keys():
130 unit_type_df[charact] = np.nan
131
132 if charact == "time_period":
133 unit_type_df[charact] = unit_type_df[charact].replace(np.nan, "no_exact_time")
134
135
136 unique_charact_1 = unit_type_df[charact_1].unique()
137 unique_charact_2 = unit_type_df[charact_2].unique()
138 unique_charact_3 = unit_type_df[charact_3].unique()
139
140
141
142 unique_charact_4 = unit_type_df[charact_4].unique() if charact_4 else [None]
143
144 v_logger.info(f"Unique {charact_1}s: {unique_charact_1}.", LogMode.VERBOSELY)
145 v_logger.info(f"Unique {charact_2}s: {unique_charact_2}.", LogMode.VERBOSELY)
146 v_logger.info(f"Unique {charact_3}s: {unique_charact_3}.", LogMode.VERBOSELY)
147
148 if charact_4:
149 v_logger.info(f"Unique {charact_4}s: {unique_charact_4}.", LogMode.VERBOSELY)
150
151
152 for u_charact_1 in unique_charact_1:
153 v_logger.info("-", LogMode.VERBOSELY)
154 v_logger.info(f"Current {charact_1}: {u_charact_1}.", LogMode.VERBOSELY)
155
156 for u_charact_2 in unique_charact_2:
157 v_logger.info(f"Current {charact_2}: {u_charact_2}.", LogMode.VERBOSELY)
158
159
160 df_lvl2: pd.DataFrame = unit_type_df[
161 (unit_type_df[charact_1] == u_charact_1)
162 & (unit_type_df[charact_2] == u_charact_2)
163 ]
164
165 for u_charact_3 in unique_charact_3:
166
167 df_lvl3: pd.DataFrame = df_lvl2[df_lvl2[charact_3] == u_charact_3]
168
169 for u_charact_4 in unique_charact_4:
170
171 df_lvl4: pd.DataFrame = (
172 df_lvl3 if charact_4 is None else df_lvl3[df_lvl3[charact_4] == u_charact_4]
173 )
174
175 if df_lvl4.empty:
176 continue
177
178
179 df_lvl4 = MedianDedupedDF(df_lvl4, "cid", "dose")
180
181
182 if (
183 len(df_lvl4) >= filtering_config["occurrence_characteristics_number"]
184 and not df_lvl4.empty
185 ):
186 df_lvl4["pLD"] = -np.log10((df_lvl4["dose"] / df_lvl4["mw"]) / 1_000_000)
187
188 os.makedirs(f"{charact_folder_name}/{unit_type}", exist_ok=True)
189
190 file_suffix = f"{u_charact_1}_{u_charact_2}_{u_charact_3}"
191 if charact_4:
192 file_suffix += f"_{u_charact_4}"
193
194 filtered_file_name = (
195 f"{charact_folder_name}/{unit_type}/"
196 f"{toxicity_config['results_file_name']}_{file_suffix}"
197 )
198
199 if os.path.exists(f"{filtered_file_name}.csv") and config["skip_downloaded"]:
200 v_logger.info(
201 f"{file_suffix} is already downloaded, skip.", LogMode.VERBOSELY
202 )
203 v_logger.info("~", LogMode.VERBOSELY)
204
205 continue
206
207
208 df_lvl4.to_csv(f"{filtered_file_name}.csv", index=False)
209
210
211 if toxicity_config["download_compounds_sdf"]:
212 v_logger.info(
213 f"Saving {unit_type} characteristics to .sdf...", LogMode.VERBOSELY
214 )
215
216
217 cids: list[str] = list(df_lvl4["cid"])
218 SaveMolfilesToSDF(
219 data=pd.DataFrame({"cid": cids, "molfile": GetMolfilesFromCIDs(cids)}),
220 file_name=filtered_file_name,
221 molecule_id_column_name="cid",
222 extra_data=df_lvl4,
223 indexing_lists=True,
224 )
225
226 v_logger.success(
227 f"Saving {unit_type} characteristics to .sdf!", LogMode.VERBOSELY
228 )
229
230
231 v_logger.success(
232 f"Saved {file_suffix}, len: {len(df_lvl4)}!", LogMode.VERBOSELY
233 )
234 v_logger.info("~", LogMode.VERBOSELY)
235
236
237 v_logger.success(f"Filtering by characteristics for {unit_type}!")