Update data_processing.py #3

Open
wants to merge 2 commits into base: main
119 changes: 103 additions & 16 deletions data_processing.py
@@ -36,14 +36,20 @@ def get_chemblid_smiles_inchi_dict(smiles_inchi_fl):


def save_comp_imgs_from_smiles(tar_id, comp_id, smiles, rotations, target_prediction_dataset_path, SIZE=300, rot_size=300):

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Invalid SMILES: {smiles}")
        return

    """
    Draw.DrawingOptions.atomLabelFontSize = 55
    Draw.DrawingOptions.dotsPerAngstrom = 100
    Draw.DrawingOptions.bondLineWidth = 1.5
    """

    base_path = os.path.join(target_prediction_dataset_path, tar_id, "imgs")

    if not os.path.exists(base_path):
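The rest of this hunk is folded, but the function renders one PNG per rotation angle. A minimal sketch of that idea with RDKit and Pillow — the helper name, white fill color, and flat output layout are assumptions for illustration, not taken from this diff:

from rdkit import Chem
from rdkit.Chem import Draw

def render_rotations_sketch(smiles, comp_id, out_dir, size=300):
    # Hypothetical helper: draw the molecule once, then save one rotated
    # copy per 10-degree step using the "<comp_id>_<angle>.png" naming
    # that the rest of this file expects.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return
    img = Draw.MolToImage(mol, size=(size, size))
    for angle in range(0, 360, 10):
        rotated = img.rotate(angle, fillcolor=(255, 255, 255))  # white background assumed
        rotated.save(f"{out_dir}/{comp_id}_{angle}.png")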
@@ -88,15 +94,20 @@ def process_smiles(smiles_data):
    rotations = [(0, "_0"), *[(angle, f"_{angle}") for angle in range(10, 360, 10)]]
    local_dict = {test_val_train_situation: []}
    try:

        save_comp_imgs_from_smiles(targetid, compound_id, current_smiles, rotations, target_prediction_dataset_path)

        # Only register the rotation image IDs if the base image was actually written.
        if os.path.exists(os.path.join(target_prediction_dataset_path, targetid, "imgs", "{}_0.png".format(compound_id))):
            for i in range(0, 360, 10):
                local_dict[test_val_train_situation].append([compound_id + "_" + str(i), int(act_inact)])
        else:
            print("( Skipped ) SMILES could not be converted to an image:", compound_id)
    except Exception:
        print(compound_id, targetid)

    return local_dict
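For reference, each worker returns a single-key dictionary mapping the compound's split to its [image_id, label] pairs; a hypothetical return value for an active compound in the train split (compound ID invented for illustration):

# {"train": [["CHEMBL153534_0", 1], ["CHEMBL153534_10", 1], ..., ["CHEMBL153534_350", 1]]}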

def generate_images(smiles_file, targetid, max_cores, tar_train_val_test_dict, target_prediction_dataset_path):
@@ -108,12 +119,14 @@ def generate_images(smiles_file, targetid, max_cores,tar_train_val_test_dict,tar
    test_val_train_situations = pd.read_csv(smiles_file)["test_val_train"].tolist()
    smiles_data_list = [(smiles, compound_ids[i], target_prediction_dataset_path, targetid, act_inact_situations[i], test_val_train_situations[i]) for i, smiles in enumerate(smiles_list)]

    start_time = time.time()

    with ProcessPoolExecutor(max_workers=max_cores) as executor:
        results = list(executor.map(process_smiles, smiles_data_list))
    end_time = time.time()

    print("result", len(results))
    for result in results:
        for key, value in result.items():
            tar_train_val_test_dict[key].extend(value)
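Since executor.map returns results in the same order as its input, each returned dict lines up with its entry in smiles_data_list, and the merge loop above simply extends the shared split lists. A toy illustration of the ordering guarantee:

from concurrent.futures import ProcessPoolExecutor

def square(x):
    return x * x

if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as ex:
        print(list(ex.map(square, [3, 1, 2])))  # prints [9, 1, 4]: input order is preserved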
@@ -154,9 +167,9 @@ def get_act_inact_list_for_all_targets(fl):
    act_inact_dict = dict()
    with open(fl) as f:
        for line in f:
            if line.strip() != "":  # make sure the line is not empty
                parts = line.strip().split("\t")
                if len(parts) == 2:  # make sure the line was parsed correctly
                    chembl_part, comps = parts
                    chembl_target_id, act_inact = chembl_part.split("_")
                    if act_inact == "act":
@@ -167,7 +180,7 @@ def get_act_inact_list_for_all_targets(fl):
                    act_inact_dict[chembl_target_id][1] = inact_list
    return act_inact_dict
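The parser above implies a two-column TSV: a target ID fused with an "act"/"inact" suffix, a tab, then the compound IDs (presumably comma-separated; that column's handling is folded out of this hunk). A hypothetical pair of rows, with <TAB> marking the tab character and the IDs invented for illustration:

CHEMBL286_act<TAB>CHEMBL153534,CHEMBL440060
CHEMBL286_inact<TAB>CHEMBL405398,CHEMBL503634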

def create_act_inact_files_for_targets(fl, target_id, chembl_version, pchembl_threshold, scaffold, target_prediction_dataset_path=None):
    # Create target directory if it doesn't exist
    target_dir = os.path.join(target_prediction_dataset_path, target_id)
    os.makedirs(target_dir, exist_ok=True)
@@ -179,8 +192,7 @@ def create_act_inact_files_for_targets(fl, target_id, chembl_version, pchembl_th
    pre_filt_chembl_df['activity_label'] = (pre_filt_chembl_df['pchembl_value'] >= pchembl_threshold).astype(int)

    # Now split the labeled data
    train_ids, val_ids, test_ids = train_val_test_split(pre_filt_chembl_df, scaffold, split_ratios=(0.8, 0.1, 0.1))

    # Create separate dataframes for train/val/test
    train_df = pre_filt_chembl_df[pre_filt_chembl_df['molecule_chembl_id'].isin(train_ids)]
@@ -294,13 +306,85 @@ def create_act_inact_files_similarity_based_neg_enrichment_threshold(act_inact_f
    act_inact_count_fl.close()
    act_inact_comp_fl.close()

def create_final_randomized_training_val_test_sets(activity_data, max_cores, scaffold, targetid, target_prediction_dataset_path, moleculenet, pchembl_threshold=6):
Collaborator:
Here we need to take another approach. Please first look at this code to understand how we did the MoleculeNet split in SELFormer: https://github.com/HUBioDataLab/SELFormer/blob/65e686feb72185cc95f5b81176444e20586848e1/prepare_finetuning_data.py#L42

What you did is not wrong, but it is a bit problematic. We can just choose the last column (-1) as the label for classification and regression, and we can select a slice (e.g. 1:17). Now please examine the code above and rewrite the function. You can also add a MoleculeNet flag in the main_training part to route the MoleculeNet datasets to the new train_test function.
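A minimal sketch of the generic loader the reviewer is describing, assuming a standard MoleculeNet CSV layout; the function name, the "smiles" column, and the task_type flag are assumptions for illustration, not the SELFormer implementation:

import pandas as pd

def load_moleculenet_labels_sketch(csv_path, task_type="classification"):
    # Read a MoleculeNet-style CSV and take the last column (-1) as the
    # label, as suggested above; a slice such as df.iloc[:, 1:17] could
    # select multi-task label columns instead.
    df = pd.read_csv(csv_path)
    smiles = df["smiles"].tolist()     # assumes the SMILES column is named "smiles"
    labels = df.iloc[:, -1].tolist()   # last column as the label
    if task_type == "classification":
        labels = [int(v) for v in labels]
    return list(zip(smiles, labels))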




    if moleculenet:

        pandas_df = pd.read_csv(activity_data)

        # pandas_df = pandas_df.head(200)  # uncomment to preview the pipeline on a small sample

        print(pandas_df.columns)

        if "HIV_active" in pandas_df.columns:
            print("HIV")

            print(pandas_df.dtypes)
            pandas_df["molecule_chembl_id"] = [f"HIV{i+1}" for i in range(len(pandas_df))]
            pandas_df["canonical_smiles"] = pandas_df["smiles"]
            act_ids = pandas_df[pandas_df["HIV_active"] == 1]["molecule_chembl_id"].tolist()
            inact_ids = pandas_df[pandas_df["HIV_active"] == 0]["molecule_chembl_id"].tolist()
            act_inact_dict = {targetid: [act_ids, inact_ids]}
            print("act len", len(act_ids))
            print("inact len", len(inact_ids))

            # Placeholder entries mimic the shape of get_chemblid_smiles_inchi_dict's
            # values, with the SMILES kept in the last slot.
            moleculenet_dict = {}
            for i, row_ in pandas_df.iterrows():
                cid = row_["molecule_chembl_id"]
                smi = row_["canonical_smiles"]
                moleculenet_dict[cid] = ["dummy1", "dummy2", "dummy3", smi]
            chemblid_smiles_dict = moleculenet_dict

elif("p_np" in pandas_df.columns):

print("BBBP")
pandas_df["molecule_chembl_id"] = pandas_df["name"]
pandas_df["canonical_smiles"] = pandas_df["smiles"]
act_ids = pandas_df[pandas_df["p_np"] == 1]["molecule_chembl_id"].tolist()
inact_ids = pandas_df[pandas_df["p_np"] == 0]["molecule_chembl_id"].tolist()
act_inact_dict = {targetid: [act_ids, inact_ids]}

moleculenet_dict = {}
for i, row_ in pandas_df.iterrows():
cid = row_["molecule_chembl_id"]
smi = row_["canonical_smiles"]
moleculenet_dict[cid] = ["dummy1", "dummy2", "dummy3", smi]
chemblid_smiles_dict = moleculenet_dict

elif("mol" in pandas_df.columns):

print("BACE")

pandas_df["molecule_chembl_id"] = pandas_df["CID"]
pandas_df["canonical_smiles"] = pandas_df["mol"]
act_ids = pandas_df[pandas_df["Class"] == 1]["molecule_chembl_id"].tolist()
inact_ids = pandas_df[pandas_df["Class"] == 0]["molecule_chembl_id"].tolist()
act_inact_dict = {targetid: [act_ids, inact_ids]}

moleculenet_dict = {}
for i, row_ in pandas_df.iterrows():
cid = row_["molecule_chembl_id"]
smi = row_["canonical_smiles"]
moleculenet_dict[cid] = ["dummy1", "dummy2", "dummy3", smi]
chemblid_smiles_dict = moleculenet_dict

        else:

            chemblid_smiles_dict = get_chemblid_smiles_inchi_dict(activity_data)
            print("Dataset not recognized as HIV, BBBP, or BACE; using the generic ChEMBL-style loader")

    else:

        chemblid_smiles_dict = get_chemblid_smiles_inchi_dict(activity_data)

        create_act_inact_files_for_targets(activity_data, targetid, "chembl", pchembl_threshold, scaffold, target_prediction_dataset_path)

        act_inact_dict = get_act_inact_list_for_all_targets("{}/{}/{}_preprocessed_filtered_act_inact_comps_pchembl_{}.tsv".format(target_prediction_dataset_path, targetid, "chembl", pchembl_threshold))

    print(len(act_inact_dict))

    for tar in act_inact_dict:

@@ -391,6 +475,7 @@ def create_final_randomized_training_val_test_sets(activity_data,max_cores,targe

        smiles_file = last_smiles_file

        print("len smiles file", len(smiles_file))
        initialize_dirs(targetid, target_prediction_dataset_path)
        generate_images(smiles_file, targetid, max_cores, tar_train_val_test_dict, target_prediction_dataset_path)

@@ -401,7 +486,7 @@ def create_final_randomized_training_val_test_sets(activity_data,max_cores,targe
        with open(os.path.join(target_prediction_dataset_path, tar, 'train_val_test_dict.json'), 'w') as fp:
            json.dump(tar_train_val_test_dict, fp)

def train_val_test_split(smiles_file, scaffold_split, split_ratios=(0.8, 0.1, 0.1)):
"""
Split data into train/val/test sets using either random or scaffold-based splitting

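For context, scaffold-based splitting groups compounds by their Bemis-Murcko scaffold so that structurally related molecules end up on the same side of the split. A minimal sketch of the grouping step with RDKit; the greedy-assignment comment describes the common pattern, not necessarily this repo's exact implementation:

from collections import defaultdict
from rdkit.Chem.Scaffolds import MurckoScaffold

def scaffold_groups_sketch(smiles_list):
    # Group compound indices by their Bemis-Murcko scaffold SMILES.
    groups = defaultdict(list)
    for idx, smi in enumerate(smiles_list):
        scaffold = MurckoScaffold.MurckoScaffoldSmiles(smiles=smi, includeChirality=False)
        groups[scaffold].append(idx)
    # A split would then assign whole groups (largest first) to train until
    # the 0.8 quota is met, then fill val and test with the remainder.
    return sorted(groups.values(), key=len, reverse=True)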
@@ -479,8 +564,10 @@ def __getitem__(self, index):
        img_paths = [os.path.join(self.training_dataset_path, "imgs", "{}.png".format(comp_id))]

        # Pick a random existing image for this compound; fail clearly if none was rendered.
        existing_paths = [path for path in img_paths if os.path.exists(path)]
        if not existing_paths:
            raise FileNotFoundError(f"Image not found for compound ID: {comp_id}")

        img_path = random.choice(existing_paths)
