diff --git a/concha/utils.ipynb b/concha/utils.ipynb index c672672bf682d2af877f647b4aa5da01b4fe9542..adae05ba13e9a7a33f53e604512657394e24cc9d 100644 --- a/concha/utils.ipynb +++ b/concha/utils.ipynb @@ -25,7 +25,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cluster_files(ensemble_path, ensemble_name, clustering_partition):\n", + "def get_cluster_files(ensemble_path, ensemble_name, labels_umap):\n", " \n", " # Initial parameters\n", " var_dict = {'multiframe' : 'n', 'check_folder' : True, 'do_xtc' : False, 'do_pdb' : False,\n", @@ -94,19 +94,19 @@ " traj_file = md.load_xtc(\"/\".join([var_dict[\"xtc_root_path\"],var_dict[\"xtc_files\"][0]]), top = \"/\".join([var_dict[\"xtc_root_path\"],var_dict[\"pdb_files\"][0]]))\n", " \n", " # Save .xtc cluster files\n", - " for k in tqdm(range(len(np.unique(clustering_partition[clustering_partition >= 0])))):\n", - " traj_file[np.where(clustering_partition == k)].save_xtc(\"/\".join([save_files, \"\".join([ensemble_name,'_',str(k),'.xtc'])]))\n", + " for k in tqdm(range(len(np.unique(labels_umap[labels_umap >= 0])))):\n", + " traj_file[np.where(labels_umap == k)].save_xtc(\"/\".join([save_files, \"\".join([ensemble_name,'_',str(k),'.xtc'])]))\n", "\n", " if var_dict[\"do_pdb\"]:\n", " \n", " conf_list = os.listdir(\"/\".join([var_dict[\"ensemble_path\"],var_dict[\"folders\"][0]]))\n", "\n", - " for k in tqdm(range(len(np.unique(clustering_partition[clustering_partition >= 0])))):\n", + " for k in tqdm(range(len(np.unique(labels_umap[labels_umap >= 0])))):\n", " clus_k_path = \"/\".join([save_path, \"_\".join(['clus',str(k)])])\n", " if not os.path.exists(clus_k_path):\n", " os.mkdir(clus_k_path)\n", " \n", - " clus_k = np.where(clustering_partition == k)[0]\n", + " clus_k = np.where(labels_umap == k)[0]\n", " for j in range(len(clus_k)):\n", " traj = md.load_pdb(\"/\".join([\"/\".join([var_dict[\"ensemble_path\"],var_dict[\"folders\"][0]]),conf_list[clus_k[j]]]))\n", " traj.save_pdb(\"/\".join([clus_k_path, \"\".join([ensemble_name,'_',str(clus_k[j]),'.pdb'])]))\n", @@ -121,9 +121,9 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_2umap(embedding_2d, clustering_partition, ensemble_name, results_path):\n", + "def plot_2umap(embedding_2d, labels_umap, ensemble_name, results_path):\n", " \n", - " classified = np.where(clustering_partition >= 0)[0]\n", + " classified = np.where(labels_umap >= 0)[0]\n", " \n", " output1 = widgets.Output()\n", " with output1:\n", @@ -135,7 +135,7 @@ " alpha=0.5)\n", " scatter = ax.scatter(embedding_2d[classified, 0],\n", " embedding_2d[classified, 1],\n", - " c=clustering_partition[classified],\n", + " c=labels_umap[classified],\n", " s=0.5,\n", " alpha = 1,\n", " cmap='Spectral')\n", @@ -147,9 +147,9 @@ "\n", " output2 = widgets.Output()\n", " with output2:\n", - " repartition = pd.Series(clustering_partition).value_counts()\n", + " repartition = pd.Series(labels_umap).value_counts()\n", " repartition.index = [\"Unclassified\" if i == -1 else i for i in repartition.index]\n", - " display(pd.DataFrame({\"Cluster\" : np.array(repartition.index), \"Occupancy (%)\" : 100*np.array(repartition.values)/len(clustering_partition)}))\n", + " display(pd.DataFrame({\"Cluster\" : np.array(repartition.index), \"Occupancy (%)\" : 100*np.array(repartition.values)/len(labels_umap)}))\n", " two_columns = widgets.HBox([output1, output2])\n", " display(two_columns)" ] @@ -161,7 +161,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_wmaps(wcont_data, clustering_partition, ensemble_name, results_path):\n", + "def get_wmaps(wcont_data, labels_umap, ensemble_name, results_path):\n", " \n", " \n", " maps_path = \"/\".join([results_path,\"wcont_maps\"]) # Path to save files\n", @@ -203,7 +203,7 @@ "metadata": {}, "outputs": [], "source": [ - "def representative_ensemble(size, ensemble_path, ensemble_name, clustering_partition):\n", + "def representative_ensemble(size, ensemble_path, ensemble_name, labels_umap):\n", " \n", " # Initial parameters\n", " var_dict = {'multiframe' : 'n', 'check_folder' : True, 'do_xtc' : False, 'do_pdb' : False,\n", @@ -262,7 +262,7 @@ " print(\"\\n----------------------------------------------------------------------------------\\n\")\n", " print(\"\\nSampling representative family...\\n\")\n", " \n", - " repartition = pd.Series(clustering_partition).value_counts() # Clustering partition\n", + " repartition = pd.Series(labels_umap).value_counts() # Clustering partition\n", " repartition.index = [\"Unclassified\" if i == -1 else i for i in repartition.index]\n", " repartition = repartition.drop(\"Unclassified\")\n", " probas = repartition.values/np.sum(repartition.values)\n", @@ -271,7 +271,7 @@ " for i in range(size):\n", "\n", " choose_cluster = np.random.choice(repartition.index, size = 1, p = probas)[0]\n", - " selected_conf[i] = np.random.choice(np.where(clustering_partition == choose_cluster)[0], size = 1)[0]\n", + " selected_conf[i] = np.random.choice(np.where(labels_umap == choose_cluster)[0], size = 1)[0]\n", " \n", " selected_conf = np.ndarray.astype(selected_conf, int)\n", " results_path = \"/\".join([os.path.abspath(ensemble_path),\"_\".join(['results',ensemble_name])])\n", @@ -303,7 +303,7 @@ "metadata": {}, "outputs": [], "source": [ - "def cluster_descriptors(ensemble_path, ensemble_name, clustering_partition):\n", + "def cluster_descriptors(ensemble_path, ensemble_name, labels_umap):\n", " \n", " # Initial parameters\n", " var_dict = {'multiframe' : 'n', 'check_folder' : True, 'do_xtc' : False, 'do_pdb' : False,\n", @@ -374,16 +374,16 @@ " Nconf = traj_file.n_frames\n", " \n", " dssp_types = ['H','B','E','G','I','T','S',' ']\n", - " prop_dssp = np.zeros([len(dssp_types),L,len(clustering_partition)-1])\n", - " rg = np.zeros([len(clustering_partition)-1])\n", + " prop_dssp = np.zeros([len(dssp_types),L,len(labels_umap)-1])\n", + " rg = np.zeros([len(labels_umap)-1])\n", " \n", - " for k in range(len(np.unique(clustering_partition[clustering_partition >= 0]))):\n", + " for k in range(len(np.unique(labels_umap[labels_umap >= 0]))):\n", " \n", " prop_dssp_k = np.zeros([len(dssp_types),L])\n", - " dssp_k = md.compute_dssp(traj_file[np.where(clustering_partition == k)], simplified = False)\n", - " rg[k] = np.mean(md.compute_rg(traj_file[np.where(clustering_partition == k)]))\n", + " dssp_k = md.compute_dssp(traj_file[np.where(labels_umap == k)], simplified = False)\n", + " rg[k] = np.mean(md.compute_rg(traj_file[np.where(labels_umap == k)]))\n", " for dt in range(len(dssp_types)):\n", - " prop_dssp_k[dt,:] = (dssp_k == dssp_types[dt]).sum(axis = 0)/len(np.where(clustering_partition == k)[0])\n", + " prop_dssp_k[dt,:] = (dssp_k == dssp_types[dt]).sum(axis = 0)/len(np.where(labels_umap == k)[0])\n", " prop_dssp[:,:,k] = prop_dssp_k\n", "\n", " if var_dict[\"do_pdb\"]:\n", @@ -394,13 +394,13 @@ " Nconf = len(conf_list)\n", " \n", " dssp_types = ['H','B','E','G','I','T','S',' ']\n", - " prop_dssp = np.zeros([len(dssp_types),L,len(clustering_partition)-1])\n", - " rg = np.zeros([len(clustering_partition)-1])\n", + " prop_dssp = np.zeros([len(dssp_types),L,len(labels_umap)-1])\n", + " rg = np.zeros([len(labels_umap)-1])\n", " \n", - " for k in range(len(np.unique(clustering_partition[clustering_partition >= 0]))):\n", + " for k in range(len(np.unique(labels_umap[labels_umap >= 0]))):\n", " \n", " prop_dssp_k = np.zeros([len(dssp_types),L])\n", - " clus_k = np.where(clustering_partition == k)[0]\n", + " clus_k = np.where(labels_umap == k)[0]\n", " dssp_k = np.zeros([len(clus_k),L]).astype(str)\n", " rg_k = np.zeros([len(clus_k)])\n", "\n", @@ -409,12 +409,12 @@ " rg_k[l] = md.compute_rg(md.load_pdb(\"/\".join([pdb_folder,conf_list[clus_k[l]]])))\n", " rg[k] = np.mean(rg_k)\n", " for dt in range(len(dssp_types)):\n", - " prop_dssp_k[dt,:] = (dssp_k == dssp_types[dt]).sum(axis = 0)/len(np.where(clustering_partition == k)[0])\n", + " prop_dssp_k[dt,:] = (dssp_k == dssp_types[dt]).sum(axis = 0)/len(np.where(labels_umap == k)[0])\n", " prop_dssp[:,:,k] = prop_dssp_k\n", " \n", - " for cluster in tqdm(range(len(np.unique(clustering_partition[clustering_partition >= 0])))):\n", + " for cluster in tqdm(range(len(np.unique(labels_umap[labels_umap >= 0])))):\n", " \n", - " prop_cluster = round(100*len(np.where(clustering_partition == cluster)[0])/Nconf,2)\n", + " prop_cluster = round(100*len(np.where(labels_umap == cluster)[0])/Nconf,2)\n", " fig = plt.figure(figsize=(10, 1.7))\n", " res = sns.heatmap(prop_dssp[:,:,cluster], cmap='Blues', square = True, cbar_kws={\"shrink\": .7,'label':\"Class prop.\"})\n", " xlabels = [item.get_text() for item in res.get_xmajorticklabels()]\n",