Network Analysis
To identify which sub-networks (i.e. communities) are of interest, we'll calculate the correlation between each sub-network and the relevant clinical metadata variables.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
import pickle
import seaborn as sns
Read in required data
# Read the pseudobulk expression matrix and the matching sample metadata.
# Both CSVs use their first column as the row index.
datExpr = pd.read_csv(
    '/ReCoDE-Gene-Network-Analysis/data/data/Bcell_datExpr_pseudobulk.csv',
    index_col=0,
)
metadata = pd.read_csv(
    '/ReCoDE-Gene-Network-Analysis/data/data/Bcell_metadata_pseudobulk.csv',
    index_col=0,
)
# Show the expression table as the cell output.
datExpr
| ISG15 | LINC01342 | TTLL10-AS1 | TNFRSF18 | CALML6 | CHD5 | ICMT-DT | MIR34AHG | RBP7 | MTOR-AS1 | ... | FRMPD3 | TSC22D3 | KLHL13 | AKAP14 | RHOXF1-AS1 | TMEM255A | SMIM10L2B-AS1 | IL9R_ENSG00000124334 | DDX3Y | EIF1AY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||||||
| CH-20-001 | 6.380902 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 53.239480 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 21.632603 | 17.641195 |
| CH-20-002 | 12.606751 | 2.33599 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 1.089918 | 0.000000 | 1.158743 | 1.173824 | ... | 0.000000 | 112.643970 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 45.432410 | 22.809190 |
| CH-20-004 | 12.302510 | 0.00000 | 0.000000 | 21.512184 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 42.873410 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 15.570595 | 20.173725 |
| CH-20-005 | 18.603716 | 1.16925 | 1.232658 | 4.975880 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.112746 | 190.337740 | 0.000000 | 0.0000 | 1.191559 | 0.000000 | 0.000000 | 0.000000 | 6.931139 | 1.071742 |
| CH-21-002 | 13.705297 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 44.942260 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 1.323198 | 0.000000 | 0.000000 |
| CH-21-006 | 4.377715 | 0.00000 | 0.000000 | 23.782143 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.023552 | 0.000000 | ... | 0.000000 | 12.741602 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.349793 | 12.981407 |
| CH-21-008 | 18.058025 | 0.00000 | 0.000000 | 44.614340 | 0.00000 | 1.201673 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 76.893720 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 1.080360 | 1.188176 | 2.377049 |
| CH-21-013 | 21.395964 | 0.00000 | 0.000000 | 30.426510 | 0.00000 | 0.000000 | 1.235703 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 54.458330 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 1.117969 | 1.236817 | 23.543072 | 53.250420 |
| CH-21-014 | 13.436963 | 0.00000 | 0.000000 | 11.067089 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 32.248600 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.709280 | 21.636190 |
| CH-21-017 | 22.916807 | 0.00000 | 0.000000 | 9.076924 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 2.478934 | 0.000000 | ... | 0.000000 | 188.600070 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 49.114600 | 40.454937 |
| CH-21-020 | 197.794700 | 0.00000 | 0.000000 | 122.788270 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.047435 | 0.000000 | ... | 0.000000 | 197.616580 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.765914 | 88.202680 | 173.938080 |
| CH-21-021 | 13.898113 | 0.00000 | 0.000000 | 11.169237 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 20.431047 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 11.017612 | 21.158054 |
| CH-21-028 | 7.210576 | 0.00000 | 1.066841 | 1.321003 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 57.428060 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.989963 | 0.000000 |
| CH-21-029 | 9.007506 | 0.00000 | 0.000000 | 1.928462 | 1.21185 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 157.941900 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.745571 | 2.051687 |
| CH-21-031 | 30.211197 | 0.00000 | 0.000000 | 40.325450 | 0.00000 | 0.000000 | 0.000000 | 2.130981 | 0.000000 | 0.000000 | ... | 1.244156 | 12.550498 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.227465 | 0.906813 |
| CH-21-033 | 21.972580 | 0.00000 | 0.000000 | 84.504500 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 152.658000 | 1.167664 | 1.2505 | 0.000000 | 1.199426 | 0.000000 | 0.000000 | 45.661453 | 142.600740 |
| CH-21-034 | 54.934030 | 0.00000 | 0.000000 | 147.552780 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.886594 | 167.975900 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.075679 |
| CH-21-036 | 17.018766 | 0.00000 | 0.000000 | 2.483573 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 88.924920 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 39.051266 | 10.004631 |
| CH-21-037 | 150.473450 | 0.00000 | 0.000000 | 53.255013 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 38.325650 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 33.786545 | 58.778214 |
| CH-21-046 | 9.337872 | 0.00000 | 0.000000 | 28.949800 | 0.00000 | 1.123670 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 28.600826 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.238980 | 12.119887 |
| CH-21-073 | 4.982193 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 40.201653 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 27.179262 | 2.954510 |
| CH-21-074 | 3.954194 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 1.127058 | 0.000000 | 0.000000 | ... | 0.000000 | 18.354240 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.401694 | 2.069999 |
| CH-21-077 | 33.969110 | 0.00000 | 0.000000 | 3.333775 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 1.115637 | 0.000000 | ... | 0.000000 | 161.007570 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.646068 | 0.000000 |
| CH-21-079 | 7.030363 | 0.00000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 40.449474 | 0.000000 | 0.0000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 18.332941 | 11.980942 |
24 rows × 1000 columns
# Display the metadata table loaded above.
metadata
| nCount_RNA | nFeature_RNA | donor_id.1 | MUTATION | percent.mt | scType_celltype | tissue_type | cell_type | tissue | development_stage | male | female | CH | normal | DNMT3A | TET2 | NoMutation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||
| CH-20-001 | 2490.0 | 1403 | CH-20-001 | DNMT3A R882C | 6.119578 | Naive B cells | tissue | B cell | blood | 60 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-002 | 1192.0 | 629 | CH-20-002 | DNMT3A R729W (4%), DNMT3A R736C (2%) | 3.803975 | Naive B cells | tissue | B cell | blood | 68 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-004 | 1833.0 | 985 | CH-20-004 | TET2 R1516X (30%), TET2 Q659X (29%), SRSF2 P95... | 5.335196 | Naive B cells | tissue | B cell | blood | 85 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-20-005 | 1966.0 | 886 | CH-20-005 | TET2 V1900F (2%) | 5.314136 | Naive B cells | tissue | B cell | blood | 58 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-002 | 1912.0 | 938 | CH-21-002 | none | 5.657238 | Naive B cells | tissue | B cell | blood | 48 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-006 | 1356.0 | 709 | CH-21-006 | DNMT3A R882H (13%) | 5.211849 | Naive B cells | tissue | B cell | blood | 67 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-008 | 1117.0 | 575 | CH-21-008 | none | 8.398348 | Naive B cells | tissue | B cell | blood | 70 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-013 | 1321.0 | 816 | CH-21-013 | none | 4.663212 | Naive B cells | tissue | B cell | blood | 73 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-014 | 1064.0 | 623 | CH-21-014 | SRSF2 P95R (40%), TET2 L957Ifs*15 (51%) | 4.146577 | Naive B cells | tissue | B cell | blood | 74 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-017 | 1880.0 | 953 | CH-21-017 | DNMT3A R882H (20%), IDH2 R140Q (10%), TP53 R27... | 6.519922 | Naive B cells | tissue | B cell | blood | 65 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-020 | 5325.0 | 2286 | CH-21-020 | none | 5.631046 | Naive B cells | tissue | B cell | blood | 61 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-021 | 1671.0 | 943 | CH-21-021 | none | 3.214286 | Naive B cells | tissue | B cell | blood | 83 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-028 | 1690.0 | 866 | CH-21-028 | none | 6.053894 | Naive B cells | tissue | B cell | blood | 89 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-029 | 2180.0 | 1073 | CH-21-029 | TET2 G68X (2%) | 2.570194 | Naive B cells | tissue | B cell | blood | 83 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-031 | 1592.0 | 887 | CH-21-031 | none | 6.734398 | Naive B cells | tissue | B cell | blood | 78 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-033 | 2219.0 | 1138 | CH-21-033 | TET2 (33%) | 5.670567 | Naive B cells | tissue | B cell | blood | 81 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-034 | 2010.0 | 974 | CH-21-034 | DNMT3A Q816X (8%) | 7.937365 | Naive B cells | tissue | B cell | blood | 39 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-036 | 2686.0 | 1337 | CH-21-036 | DNMT3A splice (7%) | 3.909544 | Naive B cells | tissue | B cell | blood | 91 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-037 | 3546.0 | 1645 | CH-21-037 | TET2 (6.2%) | 4.473764 | Naive B cells | tissue | B cell | blood | 71 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-046 | 1918.0 | 907 | CH-21-046 | DNMT3A W305X (24%) | 4.807084 | Naive B cells | tissue | B cell | blood | 80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-073 | 2148.0 | 1096 | CH-21-073 | SRSF2 (33%), TET2 Y1245Lfs*22 (27%), TET2 Q742... | 5.174489 | Naive B cells | tissue | B cell | blood | 77 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-074 | 1322.0 | 708 | CH-21-074 | TET2 C1378Y (23%) | 3.328561 | Naive B cells | tissue | B cell | blood | 70 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-077 | 1715.0 | 934 | CH-21-077 | DNMT3A R749C (9.1%) | 6.539510 | Naive B cells | tissue | B cell | blood | 50 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-079 | 1354.0 | 793 | CH-21-079 | DNMT3A M880V (5%) | 6.386293 | Naive B cells | tissue | B cell | blood | 78 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
# Keep the column labels of datExpr (the gene identifiers) under a short name.
gene_names = datExpr.columns
# Show the labels as the cell output.
gene_names
Index(['ISG15', 'LINC01342', 'TTLL10-AS1', 'TNFRSF18', 'CALML6', 'CHD5',
'ICMT-DT', 'MIR34AHG', 'RBP7', 'MTOR-AS1',
...
'FRMPD3', 'TSC22D3', 'KLHL13', 'AKAP14', 'RHOXF1-AS1', 'TMEM255A',
'SMIM10L2B-AS1', 'IL9R_ENSG00000124334', 'DDX3Y', 'EIF1AY'],
dtype='object', length=1000)
# Load the previously saved communities from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# files that come from a trusted source (here, this project's own output).
with open('/ReCoDE-Gene-Network-Analysis/data/other/separated_communities.pkl', 'rb') as file:
    separated_communities = pickle.load(file)
# Print the loaded object so its contents can be inspected.
print(separated_communities)
[['NKX6-3', 'ZC2HC1B', 'ZSCAN10', 'HCG9', 'CC2D2B', 'DBX2', 'MOBP', 'STAP2', 'LCN2', 'ZNF462', 'TMEM72-AS1', 'LNCOC1', 'SLC28A2', 'SLC10A5', 'SRRM5', 'SLC17A7', 'OPA1-AS1', 'DEPDC7', 'LIN28A', 'SLC6A3', 'GJB6', 'TMEM255A', 'IGLC4', 'LINC00640', 'DENND6A-AS1', 'FNDC5', 'LAMA2', 'CYYR1', 'CNKSR3', 'RNF182', 'TECTA', 'KRT2', 'PRRT3-AS1', 'LINC02267', 'MUCL3', 'CFAP99', 'CCNA1', 'SLC26A8', 'GNG12', 'PHACTR3', 'LINC01133', 'C21orf62', 'ANKRD29', 'NGFR', 'NKD2', 'GJC2', 'LIM2', 'ANO1', 'LINC00997', 'STARD6', 'ARHGAP23', 'HYDIN', 'KLHL13', 'IGLV5-37', 'PLSCR2', 'TDRP', 'REG1A', 'LINC01832', 'ARHGEF39', 'CDH6', 'LINC01891', 'MGAM', 'AKAP14', 'TDRD1', 'PRLR', 'RARRES2', 'SLC2A4', 'OR2B11', 'DTNA', 'PTGES'], ['ACTN3', 'PDK4-AS1', 'C10orf105', 'ACVR2B-AS1', 'LINC02348', 'DPYD-IT1', 'UBE2Q2P16', 'PRKD3-DT', 'LINC01986', 'CALML6', 'EPHB2', 'PAPPA-AS1', 'DLGAP2', 'MROH8', 'RAB6C', 'ZNF503', 'H3C8', 'LINC02569', 'CDC42EP1', 'TFAP2A', 'ESCO2', 'WNT3A', 'SLC12A3', 'RGPD6', 'ICA1-AS1', 'FOXD3', 'LINC02615', 'FAM174A-DT'], ['IGHV3-32', 'LINC01050', 'STON2', 'CLGN', 'CCK', 'RGL3', 'COPDA1', 'PPP1R9A-AS1', 'SPDYE21', 'BSN', 'PRR15', 'IGKV3D-11', 'C12orf71', 'SMIM17', 'FAM20C', 'KCNQ4', 'PGAM2', 'DAGLA', 'NKX6-2', 'REEP1', 'LGALSL-DT', 'ZNF32-AS2', 'ZSCAN2-AS1', 'TWIST2', 'LINC01503', 'CADM4', 'DBH-AS1'], ['MAG', 'MOCS1', 'ALDH8A1', 'MTOR-AS1', 'MEX3B', 'MYO3B', 'GLP2R', 'SCT', 'TAF1L', 'IGHV3-73', 'OR2AK2', 'PTPRD-AS1', 'ST8SIA5', 'PITPNM2-AS1', 'EPHA2', 'TKTL2', 'SLC5A5', 'PCSK1', 'CUX2', 'IGHV3-69-1', 'ZNF474'], ['LINC02880', 'COL11A2', 'SCARF2', 'ADGRG3', 'ZNF491', 'CFAP141', 'FSTL1', 'TRBJ2-4', 'SMIM10L2B-AS1', 'UBL4B', 'CLEC1B', 'DPYSL4', 'LDHAL6A', 'PRUNE2', 'LINC00200', 'BRME1', 'FAM186B', 'VWA5B1', 'RARRES1', 'LINC01108'], ['SLC1A2-AS1', 'MIR130AHG', 'HEATR4', 'SLC49A3', 'OR6C6', 'SLC4A9', 'IGKV1D-13', 'ACTL7B', 'LINC01823', 'MIPOL1', 'AMPD1', 'IGKV2D-30', 'SLC16A14', 'DEPTOR-AS1', 'EDNRB-AS1', 'H3C12', 'LINC01990', 'ATP8A1-DT', 'TGFB2-AS1'], ['NLRP6', 'PRSS45P', 'NAALADL2', 
'VSTM1', 'MYLK3', 'FOXI1', 'PCDHGA3', 'VWA7', 'HDHD5-AS1', 'LINC03021_ENSG00000254319', 'LINC01579', 'IGHV1-14', 'HCG22', 'LINC02055', 'UNC79', 'RNF217-AS1', 'C1orf50-AS1', 'GYS2'], ['CASC9', 'CCDC28A-AS1', 'TRPM1', 'CYP2S1', 'TMPRSS3', 'THTPA', 'GCNT3', 'FREY1', 'RHOXF1-AS1', 'NHS', 'IGHV7-4-1', 'SLITRK5', 'IGLV5-45', 'TRMT9B', 'CFAP57', 'RNF112', 'LINC00654'], ['ASIC3', 'SLC22A1', 'BHLHE22', 'LIF', 'S100A16', 'CT69', 'TRAV29DV5', 'C10orf71', 'TTLL7', 'MS4A4E', 'DNMBP-AS1', 'EDAR'], ['FAM182A', 'PACRG', 'NKX6-1', 'ACTA2-AS1', 'TAS2R39', 'CHRNG', 'LINC02660', 'CDH23'], ['TEKTIP1', 'ASGR2', 'IGKV2-26', 'LINC02057', 'GOLGA8H', 'LRRC49'], ['CYP2C8', 'PELP1-DT', 'LINC01301', 'CBY3', 'PTGR1'], ['KCNH6', 'LINC01980', 'DLG1-AS1', 'IQSEC2', 'LPAR4'], ['MYO3A', 'HSD17B2-AS1', 'DUOX2', 'LINC01115_ENSG00000237667'], ['FCN3', 'PCDHB4', 'S100A5'], ['USP43', 'IGFBP2', 'CNTNAP3'], ['NPM2', 'RHBDF1'], ['STXBP6', 'SERTAD3-AS1']]
Step 1: Module Eigengene Calculation
Module eigengene calculation is a concept used in the analysis of gene expression data.
Module/sub-network: A module refers to a group of genes that exhibit similar expression patterns across samples. Modules are often identified using clustering algorithms applied to gene expression data.
Eigengene: An eigengene represents the overall expression profile of a module. It is calculated as the first principal component of the gene expression profiles within the module. Essentially, the eigengene captures the main axis of variation or the common expression pattern shared by the genes within the module.
We will be using module eigengenes as representations for further downstream analysis.
# Initialise a DataFrame to store module eigengenes. It reuses datExpr's row
# index, so every eigengene column has one value per row of datExpr.
module_eigengenes = pd.DataFrame(index=datExpr.index)

# Calculate the module eigengene for each community.
# NOTE(review): scikit-learn's PCA centres each column but does not scale it,
# so genes with larger variance dominate the component - confirm whether the
# expression values should be standardised before this step.
for i, community in enumerate(separated_communities):
    # Restrict the community to genes that are actually columns of datExpr.
    community_genes = [gene for gene in community if gene in gene_names]
    # Communities with no matching genes are skipped, so their Module_<n>
    # column is simply absent from the result.
    if community_genes:
        community_expr = datExpr[community_genes]
        # The eigengene is the first principal component of the community's
        # expression columns; fit_transform returns one score per row.
        pca = PCA(n_components=1)
        eigengene = pca.fit_transform(community_expr)
        # Modules are numbered from 1, following the order of the communities.
        module_eigengenes[f'Module_{i+1}'] = eigengene[:, 0]
# Display the metadata table again to review its columns.
metadata
| nCount_RNA | nFeature_RNA | donor_id.1 | MUTATION | percent.mt | scType_celltype | tissue_type | cell_type | tissue | development_stage | male | female | CH | normal | DNMT3A | TET2 | NoMutation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||
| CH-20-001 | 2490.0 | 1403 | CH-20-001 | DNMT3A R882C | 6.119578 | Naive B cells | tissue | B cell | blood | 60 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-002 | 1192.0 | 629 | CH-20-002 | DNMT3A R729W (4%), DNMT3A R736C (2%) | 3.803975 | Naive B cells | tissue | B cell | blood | 68 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-004 | 1833.0 | 985 | CH-20-004 | TET2 R1516X (30%), TET2 Q659X (29%), SRSF2 P95... | 5.335196 | Naive B cells | tissue | B cell | blood | 85 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-20-005 | 1966.0 | 886 | CH-20-005 | TET2 V1900F (2%) | 5.314136 | Naive B cells | tissue | B cell | blood | 58 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-002 | 1912.0 | 938 | CH-21-002 | none | 5.657238 | Naive B cells | tissue | B cell | blood | 48 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-006 | 1356.0 | 709 | CH-21-006 | DNMT3A R882H (13%) | 5.211849 | Naive B cells | tissue | B cell | blood | 67 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-008 | 1117.0 | 575 | CH-21-008 | none | 8.398348 | Naive B cells | tissue | B cell | blood | 70 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-013 | 1321.0 | 816 | CH-21-013 | none | 4.663212 | Naive B cells | tissue | B cell | blood | 73 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-014 | 1064.0 | 623 | CH-21-014 | SRSF2 P95R (40%), TET2 L957Ifs*15 (51%) | 4.146577 | Naive B cells | tissue | B cell | blood | 74 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-017 | 1880.0 | 953 | CH-21-017 | DNMT3A R882H (20%), IDH2 R140Q (10%), TP53 R27... | 6.519922 | Naive B cells | tissue | B cell | blood | 65 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-020 | 5325.0 | 2286 | CH-21-020 | none | 5.631046 | Naive B cells | tissue | B cell | blood | 61 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-021 | 1671.0 | 943 | CH-21-021 | none | 3.214286 | Naive B cells | tissue | B cell | blood | 83 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-028 | 1690.0 | 866 | CH-21-028 | none | 6.053894 | Naive B cells | tissue | B cell | blood | 89 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-029 | 2180.0 | 1073 | CH-21-029 | TET2 G68X (2%) | 2.570194 | Naive B cells | tissue | B cell | blood | 83 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-031 | 1592.0 | 887 | CH-21-031 | none | 6.734398 | Naive B cells | tissue | B cell | blood | 78 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-033 | 2219.0 | 1138 | CH-21-033 | TET2 (33%) | 5.670567 | Naive B cells | tissue | B cell | blood | 81 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-034 | 2010.0 | 974 | CH-21-034 | DNMT3A Q816X (8%) | 7.937365 | Naive B cells | tissue | B cell | blood | 39 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-036 | 2686.0 | 1337 | CH-21-036 | DNMT3A splice (7%) | 3.909544 | Naive B cells | tissue | B cell | blood | 91 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-037 | 3546.0 | 1645 | CH-21-037 | TET2 (6.2%) | 4.473764 | Naive B cells | tissue | B cell | blood | 71 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-046 | 1918.0 | 907 | CH-21-046 | DNMT3A W305X (24%) | 4.807084 | Naive B cells | tissue | B cell | blood | 80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-073 | 2148.0 | 1096 | CH-21-073 | SRSF2 (33%), TET2 Y1245Lfs*22 (27%), TET2 Q742... | 5.174489 | Naive B cells | tissue | B cell | blood | 77 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-074 | 1322.0 | 708 | CH-21-074 | TET2 C1378Y (23%) | 3.328561 | Naive B cells | tissue | B cell | blood | 70 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-077 | 1715.0 | 934 | CH-21-077 | DNMT3A R749C (9.1%) | 6.539510 | Naive B cells | tissue | B cell | blood | 50 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-079 | 1354.0 | 793 | CH-21-079 | DNMT3A M880V (5%) | 6.386293 | Naive B cells | tissue | B cell | blood | 78 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
Let's tidy up the metadata dataframe further so that it contains only the columns needed for this analysis.
# Build a trimmed copy of the metadata without the listed columns;
# the original `metadata` DataFrame is left unchanged.
metadata2 = metadata.drop(
    columns=[
        'donor_id.1',
        'scType_celltype',
        'tissue_type',
        'cell_type',
        'tissue',
        'MUTATION',
    ]
)
# Show the trimmed table as the cell output.
metadata2
| nCount_RNA | nFeature_RNA | percent.mt | development_stage | male | female | CH | normal | DNMT3A | TET2 | NoMutation | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||
| CH-20-001 | 2490.0 | 1403 | 6.119578 | 60 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-002 | 1192.0 | 629 | 3.803975 | 68 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-20-004 | 1833.0 | 985 | 5.335196 | 85 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-20-005 | 1966.0 | 886 | 5.314136 | 58 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-002 | 1912.0 | 938 | 5.657238 | 48 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-006 | 1356.0 | 709 | 5.211849 | 67 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-008 | 1117.0 | 575 | 8.398348 | 70 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-013 | 1321.0 | 816 | 4.663212 | 73 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-014 | 1064.0 | 623 | 4.146577 | 74 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-017 | 1880.0 | 953 | 6.519922 | 65 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-020 | 5325.0 | 2286 | 5.631046 | 61 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-021 | 1671.0 | 943 | 3.214286 | 83 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| CH-21-028 | 1690.0 | 866 | 6.053894 | 89 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-029 | 2180.0 | 1073 | 2.570194 | 83 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| CH-21-031 | 1592.0 | 887 | 6.734398 | 78 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| CH-21-033 | 2219.0 | 1138 | 5.670567 | 81 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-034 | 2010.0 | 974 | 7.937365 | 39 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-036 | 2686.0 | 1337 | 3.909544 | 91 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-037 | 3546.0 | 1645 | 4.473764 | 71 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-046 | 1918.0 | 907 | 4.807084 | 80 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
| CH-21-073 | 2148.0 | 1096 | 5.174489 | 77 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-074 | 1322.0 | 708 | 3.328561 | 70 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| CH-21-077 | 1715.0 | 934 | 6.539510 | 50 | 0 | 1 | 1 | 0 | 1 | 0 | 0 |
| CH-21-079 | 1354.0 | 793 | 6.386293 | 78 | 1 | 0 | 1 | 0 | 1 | 0 | 0 |
# Merge module eigengenes with metadata: the two tables are placed side by
# side (axis=1), with rows aligned on their index labels.
merged_data = pd.concat([metadata2, module_eigengenes], axis=1)
# Show the merged table as the cell output.
merged_data
| nCount_RNA | nFeature_RNA | percent.mt | development_stage | male | female | CH | normal | DNMT3A | TET2 | ... | Module_9 | Module_10 | Module_11 | Module_12 | Module_13 | Module_14 | Module_15 | Module_16 | Module_17 | Module_18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| donor_id | |||||||||||||||||||||
| CH-20-001 | 2490.0 | 1403 | 6.119578 | 60 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-20-002 | 1192.0 | 629 | 3.803975 | 68 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-20-004 | 1833.0 | 985 | 5.335196 | 85 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-20-005 | 1966.0 | 886 | 5.314136 | 58 | 0 | 1 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-002 | 1912.0 | 938 | 5.657238 | 48 | 0 | 1 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-006 | 1356.0 | 709 | 5.211849 | 67 | 0 | 1 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-008 | 1117.0 | 575 | 8.398348 | 70 | 0 | 1 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-013 | 1321.0 | 816 | 4.663212 | 73 | 1 | 0 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-014 | 1064.0 | 623 | 4.146577 | 74 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | 2.744634 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-017 | 1880.0 | 953 | 6.519922 | 65 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-020 | 5325.0 | 2286 | 5.631046 | 61 | 1 | 0 | 0 | 1 | 0 | 0 | ... | 4.331426 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-021 | 1671.0 | 943 | 3.214286 | 83 | 1 | 0 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | 2.475387 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-028 | 1690.0 | 866 | 6.053894 | 89 | 0 | 1 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | 2.486453 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-029 | 2180.0 | 1073 | 2.570194 | 83 | 0 | 1 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-031 | 1592.0 | 887 | 6.734398 | 78 | 0 | 1 | 0 | 1 | 0 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | 1.535032 | -0.070642 |
| CH-21-033 | 2219.0 | 1138 | 5.670567 | 81 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-034 | 2010.0 | 974 | 7.937365 | 39 | 0 | 1 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-036 | 2686.0 | 1337 | 3.909544 | 91 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | 2.167574 | 1.941447 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-037 | 3546.0 | 1645 | 4.473764 | 71 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | 1.963424 | -0.066741 | -0.070642 |
| CH-21-046 | 1918.0 | 907 | 4.807084 | 80 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-073 | 2148.0 | 1096 | 5.174489 | 77 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-074 | 1322.0 | 708 | 3.328561 | 70 | 1 | 0 | 1 | 0 | 0 | 1 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | 1.624761 |
| CH-21-077 | 1715.0 | 934 | 6.539510 | 50 | 0 | 1 | 1 | 0 | 1 | 0 | ... | -0.188323 | 3.185443 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
| CH-21-079 | 1354.0 | 793 | 6.386293 | 78 | 1 | 0 | 1 | 0 | 1 | 0 | ... | -0.188323 | -0.138498 | -0.119332 | -0.107626 | -0.108107 | -0.094242 | -0.084411 | -0.085366 | -0.066741 | -0.070642 |
24 rows × 29 columns
# Calculate Pearson correlation between module eigengenes and metadata.
# DataFrame.corr is computed for every pair of columns in merged_data, so the
# result also holds metadata-vs-metadata and module-vs-module correlations.
correlation_matrix2 = merged_data.corr(method='pearson')
# Show the full correlation matrix as the cell output.
correlation_matrix2
| nCount_RNA | nFeature_RNA | percent.mt | development_stage | male | female | CH | normal | DNMT3A | TET2 | ... | Module_9 | Module_10 | Module_11 | Module_12 | Module_13 | Module_14 | Module_15 | Module_16 | Module_17 | Module_18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| nCount_RNA | 1.000000 | 0.984470 | -0.036198 | -0.104184 | 0.221317 | -0.221317 | -0.079895 | 0.079895 | -0.118003 | 0.044151 | ... | 0.787767 | -0.062271 | -0.215561 | -0.072632 | -0.068158 | 0.166368 | 0.166368 | 0.368870 | -0.091234 | -0.154810 |
| nFeature_RNA | 0.984470 | 1.000000 | -0.046930 | -0.064074 | 0.284235 | -0.284235 | -0.070985 | 0.070985 | -0.094982 | 0.029101 | ... | 0.737328 | -0.040462 | -0.219377 | -0.035284 | -0.079582 | 0.191379 | 0.191379 | 0.368568 | -0.067501 | -0.170477 |
| percent.mt | -0.036198 | -0.046930 | 1.000000 | -0.450374 | -0.403977 | 0.403977 | -0.205453 | 0.205453 | 0.208230 | -0.411947 | ... | 0.046851 | 0.182254 | -0.174404 | -0.313359 | 0.109875 | -0.209733 | -0.209733 | -0.125638 | 0.211302 | -0.296327 |
| development_stage | -0.104184 | -0.064074 | -0.450374 | 1.000000 | 0.368332 | -0.368332 | -0.045963 | 0.045963 | -0.261424 | 0.224159 | ... | -0.158508 | -0.336576 | 0.051937 | 0.197629 | 0.294757 | 0.327133 | 0.327133 | 0.003373 | 0.116689 | -0.012816 |
| male | 0.221317 | 0.284235 | -0.403977 | 0.368332 | 1.000000 | -1.000000 | 0.260360 | -0.260360 | 0.066667 | 0.182574 | ... | 0.161515 | -0.269191 | 0.161515 | 0.161515 | -0.269191 | 0.161515 | 0.161515 | 0.161515 | -0.269191 | 0.161515 |
| female | -0.221317 | -0.284235 | 0.403977 | -0.368332 | -1.000000 | 1.000000 | -0.260360 | 0.260360 | -0.066667 | -0.182574 | ... | -0.161515 | 0.269191 | -0.161515 | -0.161515 | 0.269191 | -0.161515 | -0.161515 | -0.161515 | 0.269191 | -0.161515 |
| CH | -0.079895 | -0.070985 | -0.205453 | -0.045963 | 0.260360 | -0.260360 | 1.000000 | -1.000000 | 0.497050 | 0.453743 | ... | -0.324946 | 0.133801 | 0.133801 | -0.324946 | -0.324946 | 0.133801 | 0.133801 | 0.133801 | -0.324946 | 0.133801 |
| normal | 0.079895 | 0.070985 | 0.205453 | 0.045963 | -0.260360 | 0.260360 | -1.000000 | 1.000000 | -0.497050 | -0.453743 | ... | 0.324946 | -0.133801 | -0.133801 | 0.324946 | 0.324946 | -0.133801 | -0.133801 | -0.133801 | 0.324946 | -0.133801 |
| DNMT3A | -0.118003 | -0.094982 | 0.208230 | -0.261424 | 0.066667 | -0.066667 | 0.497050 | -0.497050 | 1.000000 | -0.547723 | ... | -0.161515 | 0.269191 | -0.161515 | -0.161515 | -0.161515 | 0.269191 | 0.269191 | -0.161515 | -0.161515 | -0.161515 |
| TET2 | 0.044151 | 0.029101 | -0.411947 | 0.224159 | 0.182574 | -0.182574 | 0.453743 | -0.453743 | -0.547723 | 1.000000 | ... | -0.147442 | -0.147442 | 0.294884 | -0.147442 | -0.147442 | -0.147442 | -0.147442 | 0.294884 | -0.147442 | 0.294884 |
| NoMutation | 0.079895 | 0.070985 | 0.205453 | 0.045963 | -0.260360 | 0.260360 | -1.000000 | 1.000000 | -0.497050 | -0.453743 | ... | 0.324946 | -0.133801 | -0.133801 | 0.324946 | 0.324946 | -0.133801 | -0.133801 | -0.133801 | 0.324946 | -0.133801 |
| Module_1 | 0.056404 | 0.076897 | 0.052741 | 0.165253 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_2 | 0.047221 | 0.039503 | -0.409359 | 0.197629 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_3 | -0.203081 | -0.246991 | 0.459308 | -0.012816 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_4 | -0.185421 | -0.215925 | -0.225468 | -0.045192 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_5 | -0.155046 | -0.108346 | -0.097401 | 0.035749 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_6 | 0.007192 | -0.017450 | 0.390600 | -0.514644 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_7 | -0.023419 | -0.029531 | 0.179335 | -0.093756 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_8 | -0.003169 | -0.068076 | -0.000384 | -0.207072 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_9 | 0.787767 | 0.737328 | 0.046851 | -0.158508 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_10 | -0.062271 | -0.040462 | 0.182254 | -0.336576 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_11 | -0.215561 | -0.219377 | -0.174404 | 0.051937 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_12 | -0.072632 | -0.035284 | -0.313359 | 0.197629 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_13 | -0.068158 | -0.079582 | 0.109875 | 0.294757 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_14 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 1.000000 | -0.043478 | -0.043478 | -0.043478 |
| Module_15 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 1.000000 | -0.043478 | -0.043478 | -0.043478 |
| Module_16 | 0.368870 | 0.368568 | -0.125638 | 0.003373 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 |
| Module_17 | -0.091234 | -0.067501 | 0.211302 | 0.116689 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 |
| Module_18 | -0.154810 | -0.170477 | -0.296327 | -0.012816 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 |
29 rows × 29 columns
# The full correlation matrix has one row per variable (metadata and modules alike).
# Remove the metadata rows so that only the module rows remain.
metadata_rows = [
    'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'development_stage',
    'male', 'female', 'CH', 'normal', 'DNMT3A', 'TET2', 'NoMutation',
]
correlation_matrix3 = correlation_matrix2.drop(index=metadata_rows)
correlation_matrix3
| nCount_RNA | nFeature_RNA | percent.mt | development_stage | male | female | CH | normal | DNMT3A | TET2 | ... | Module_9 | Module_10 | Module_11 | Module_12 | Module_13 | Module_14 | Module_15 | Module_16 | Module_17 | Module_18 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Module_1 | 0.056404 | 0.076897 | 0.052741 | 0.165253 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_2 | 0.047221 | 0.039503 | -0.409359 | 0.197629 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_3 | -0.203081 | -0.246991 | 0.459308 | -0.012816 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_4 | -0.185421 | -0.215925 | -0.225468 | -0.045192 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_5 | -0.155046 | -0.108346 | -0.097401 | 0.035749 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_6 | 0.007192 | -0.017450 | 0.390600 | -0.514644 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_7 | -0.023419 | -0.029531 | 0.179335 | -0.093756 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_8 | -0.003169 | -0.068076 | -0.000384 | -0.207072 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_9 | 0.787767 | 0.737328 | 0.046851 | -0.158508 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_10 | -0.062271 | -0.040462 | 0.182254 | -0.336576 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_11 | -0.215561 | -0.219377 | -0.174404 | 0.051937 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_12 | -0.072632 | -0.035284 | -0.313359 | 0.197629 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_13 | -0.068158 | -0.079582 | 0.109875 | 0.294757 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 |
| Module_14 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 1.000000 | -0.043478 | -0.043478 | -0.043478 |
| Module_15 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | 1.000000 | -0.043478 | -0.043478 | -0.043478 |
| Module_16 | 0.368870 | 0.368568 | -0.125638 | 0.003373 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 | -0.043478 |
| Module_17 | -0.091234 | -0.067501 | 0.211302 | 0.116689 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 | -0.043478 |
| Module_18 | -0.154810 | -0.170477 | -0.296327 | -0.012816 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | ... | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | -0.043478 | 1.000000 |
18 rows × 29 columns
# Everything from the 12th column onwards is removed, leaving only the first
# 11 columns of the matrix.
trailing_columns = correlation_matrix3.columns[11:]
correlation_matrix3 = correlation_matrix3.drop(columns=trailing_columns)
correlation_matrix3
| nCount_RNA | nFeature_RNA | percent.mt | development_stage | male | female | CH | normal | DNMT3A | TET2 | NoMutation | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Module_1 | 0.056404 | 0.076897 | 0.052741 | 0.165253 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
| Module_2 | 0.047221 | 0.039503 | -0.409359 | 0.197629 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
| Module_3 | -0.203081 | -0.246991 | 0.459308 | -0.012816 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_4 | -0.185421 | -0.215925 | -0.225468 | -0.045192 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_5 | -0.155046 | -0.108346 | -0.097401 | 0.035749 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_6 | 0.007192 | -0.017450 | 0.390600 | -0.514644 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_7 | -0.023419 | -0.029531 | 0.179335 | -0.093756 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_8 | -0.003169 | -0.068076 | -0.000384 | -0.207072 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
| Module_9 | 0.787767 | 0.737328 | 0.046851 | -0.158508 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_10 | -0.062271 | -0.040462 | 0.182254 | -0.336576 | -0.269191 | 0.269191 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_11 | -0.215561 | -0.219377 | -0.174404 | 0.051937 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
| Module_12 | -0.072632 | -0.035284 | -0.313359 | 0.197629 | 0.161515 | -0.161515 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_13 | -0.068158 | -0.079582 | 0.109875 | 0.294757 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_14 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_15 | 0.166368 | 0.191379 | -0.209733 | 0.327133 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | 0.269191 | -0.147442 | -0.133801 |
| Module_16 | 0.368870 | 0.368568 | -0.125638 | 0.003373 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
| Module_17 | -0.091234 | -0.067501 | 0.211302 | 0.116689 | -0.269191 | 0.269191 | -0.324946 | 0.324946 | -0.161515 | -0.147442 | 0.324946 |
| Module_18 | -0.154810 | -0.170477 | -0.296327 | -0.012816 | 0.161515 | -0.161515 | 0.133801 | -0.133801 | -0.161515 | 0.294884 | -0.133801 |
We now need to calculate the p-values for the correlations on the merged data.
# Empty table (modules as rows, metadata variables as columns) that will
# receive one p-value per module/metadata pair
module_p_values = pd.DataFrame(index=module_eigengenes.columns, columns=metadata2.columns)

# Fill the table one metadata variable at a time
for metadata_column in metadata2.columns:
    trait_values = metadata2[metadata_column]
    for module in module_eigengenes.columns:
        # pearsonr returns (correlation coefficient, p-value); only the p-value is kept
        _, p_value = pearsonr(module_eigengenes[module], trait_values)
        module_p_values.at[module, metadata_column] = p_value

print(module_p_values)
nCount_RNA nFeature_RNA percent.mt development_stage male \
Module_1 0.793491 0.72099 0.806646 0.440304 0.450857
Module_2 0.826566 0.85459 0.046988 0.354618 0.203374
Module_3 0.341225 0.244609 0.023954 0.952607 0.203374
Module_4 0.385701 0.310888 0.289457 0.833917 0.450857
Module_5 0.469428 0.614305 0.650711 0.868287 0.450857
Module_6 0.973395 0.935497 0.059141 0.010078 0.203374
Module_7 0.913504 0.891046 0.401758 0.663019 0.450857
Module_8 0.988275 0.751953 0.998581 0.331617 0.203374
Module_9 0.000005 0.000039 0.827907 0.459441 0.450857
Module_10 0.772535 0.851099 0.39401 0.107796 0.203374
Module_11 0.311724 0.303027 0.415034 0.809543 0.450857
Module_12 0.735911 0.869983 0.135952 0.354618 0.450857
Module_13 0.751662 0.711648 0.609287 0.162049 0.203374
Module_14 0.437182 0.370342 0.3253 0.118677 0.450857
Module_15 0.437182 0.370342 0.3253 0.118677 0.450857
Module_16 0.0761 0.076359 0.55857 0.987522 0.450857
Module_17 0.671581 0.753986 0.321611 0.587131 0.203374
Module_18 0.470112 0.425779 0.15972 0.952607 0.450857
female CH normal DNMT3A TET2 NoMutation
Module_1 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_2 0.203374 0.53308 0.53308 0.450857 0.16186 0.53308
Module_3 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_4 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_5 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_6 0.203374 0.53308 0.53308 0.203374 0.49175 0.53308
Module_7 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_8 0.203374 0.53308 0.53308 0.450857 0.16186 0.53308
Module_9 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_10 0.203374 0.53308 0.53308 0.203374 0.49175 0.53308
Module_11 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_12 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_13 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_14 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_15 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_16 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_17 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_18 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
We now need to calculate the adjusted p-values.
# Benjamini-Hochberg FDR correction, applied separately to each metadata column
# (i.e. across the modules tested against that one variable)
adjusted_columns = {}
for column in module_p_values.columns:
    raw_p_values = module_p_values[column].astype(float)
    # multipletests returns (reject, corrected p-values, alphacSidak, alphacBonf)
    adjusted_columns[column] = multipletests(raw_p_values, method='fdr_bh')[1]

adjusted_p_values = pd.DataFrame(
    adjusted_columns,
    index=module_p_values.index,
    columns=module_p_values.columns,
)

print(adjusted_p_values)
nCount_RNA nFeature_RNA percent.mt development_stage male \
Module_1 0.988275 0.935497 0.876607 0.826994 0.450857
Module_2 0.988275 0.935497 0.354845 0.797891 0.450857
Module_3 0.940223 0.935497 0.354845 0.987522 0.450857
Module_4 0.940223 0.935497 0.622551 0.987522 0.450857
Module_5 0.940223 0.935497 0.780854 0.987522 0.450857
Module_6 0.988275 0.935497 0.354845 0.181400 0.450857
Module_7 0.988275 0.935497 0.622551 0.987522 0.450857
Module_8 0.988275 0.935497 0.998581 0.797891 0.450857
Module_9 0.000088 0.000710 0.876607 0.826994 0.450857
Module_10 0.988275 0.935497 0.622551 0.534046 0.450857
Module_11 0.940223 0.935497 0.622551 0.987522 0.450857
Module_12 0.988275 0.935497 0.574992 0.797891 0.450857
Module_13 0.988275 0.935497 0.780854 0.583378 0.450857
Module_14 0.940223 0.935497 0.622551 0.534046 0.450857
Module_15 0.940223 0.935497 0.622551 0.534046 0.450857
Module_16 0.684901 0.687232 0.773405 0.987522 0.450857
Module_17 0.988275 0.935497 0.622551 0.960760 0.450857
Module_18 0.940223 0.935497 0.574992 0.987522 0.450857
female CH normal DNMT3A TET2 NoMutation
Module_1 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_2 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_3 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_4 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_5 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_6 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_7 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_8 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_9 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_10 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_11 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_12 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_13 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_14 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_15 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_16 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_17 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_18 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Step 2: Correlation Heatmap¶
# Function to annotate heatmap with correlation values and p-values
def annotate_heatmap_with_p_values(ax, correlation_matrix, p_values, threshold=0.05):
    """Write each correlation and its p-value into the cells of a heatmap.

    For every cell (i, j) two text labels are added to ``ax``: the correlation
    formatted to two decimals, and the p-value in brackets (also two decimals)
    followed by ``*`` when it is less than or equal to ``threshold``.

    Parameters
    ----------
    ax : object with a ``text(x, y, s, **kwargs)`` method
        The axes the heatmap was drawn on.
    correlation_matrix : pandas.DataFrame
        Correlation values; its shape determines which cells are annotated.
    p_values : pandas.DataFrame
        P-values read by position (``.iloc``), so the rows and columns must be
        in the same order as ``correlation_matrix``.
    threshold : float, default 0.05
        P-values at or below this value are marked with a star.

    Returns
    -------
    None
    """
    n_rows, n_cols = correlation_matrix.shape
    text_style = {'ha': 'center', 'va': 'center', 'color': 'black'}

    for i in range(n_rows):
        for j in range(n_cols):
            p_value = p_values.iloc[i, j]

            # Format annotation string with correlation value
            annotation_corr = f"{correlation_matrix.iloc[i, j]:.2f}"

            # Format p-value in brackets, with a star for significant p-values
            annotation_p_value = f"({p_value:.2f})"
            if p_value <= threshold:
                annotation_p_value += '*'

            # Two stacked labels per cell: y offsets 0.4 and 0.6 around the
            # cell midpoint (assumes cell (i, j) spans the unit square starting
            # at x=j, y=i, as on a seaborn heatmap -- TODO confirm)
            ax.text(j + 0.5, i + 0.4, annotation_corr, **text_style)
            ax.text(j + 0.5, i + 0.6, annotation_p_value, **text_style)
# Draw the module-vs-metadata correlations on a fixed [-1, 1] colour scale;
# seaborn's own cell annotations are switched off because the labels are added below
plt.figure(figsize=(15, 15))
heatmap = sns.heatmap(
    correlation_matrix3,
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    annot=False,
)
heatmap.set_title('Correlation between Module Eigengenes and Metadata')

# Overlay each cell with its correlation value and FDR-adjusted p-value
annotate_heatmap_with_p_values(heatmap, correlation_matrix3, adjusted_p_values)

plt.show()
External Reading:
- PCA: https://www.sartorius.com/en/knowledge/science-snippets/what-is-principal-component-analysis-pca-and-how-it-is-used-507186
- Heatmaps: https://www.atlassian.com/data/charts/heatmap-complete-guide#:~:text=What%20is%20a%20heatmap%3F,in%20the%20corresponding%20cell%20range.
- False Discovery Rate: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1716-1
Exercise Questions:
- What is a correlation and why is it used in gene co-expression analysis?
- What is the purpose of calculating module eigengenes in the context of gene expression data analysis?
- Explain the rationale behind using PCA to calculate module eigengenes. How does PCA help in capturing the main variation in the gene expression data within a module?
- Why are adjusted p-values used instead of p-values?
- In the context of merging module eigengenes with clinical metadata, why might it be important to drop certain columns like 'donor_id.1', 'scType_celltype', 'tissue_type', 'cell_type', 'tissue', and 'MUTATION'?
- What is fdr_bh? Explain how it works.
- Based on the heatmap, what can you summarise?
- It can be important to check for missing values within the data. How would you do this? Also, if there are missing values, how would you handle them in this context?
- Perform FDR adjustment on the p-values and compare the number of significant correlations before and after correction.
- Plot the expression of eigengenes for a selected module against a specific metadata variable and provide an interpretation of the plot.
Answers:
- In the context of gene co-expression analysis, correlation refers to the statistical measure of the strength and direction of association between the expression levels of two genes across a set of samples or conditions. It quantifies the degree to which the expression patterns of two genes are similar or dissimilar across different biological contexts.
- Module eigengenes represent the overall expression profile of gene modules, summarising the main patterns of gene expression within a group of co-expressed genes. This simplifies complex gene expression data, making it easier to study associations between gene expression patterns and clinical variables.
- PCA identifies the principal components that capture the maximum variance in the data. By taking the first principal component as the module eigengene, it provides a single vector that represents the predominant expression pattern of the genes within the module, thereby summarising the main axis of variation.
- Adjusted p-values are used instead of raw p-values in statistical hypothesis testing to correct for multiple comparisons. When conducting multiple statistical tests simultaneously (such as testing multiple genes in gene expression analysis or multiple variables in genomics studies), the probability of obtaining false positive results increases. This increased probability arises due to the cumulative effect of conducting multiple tests, leading to an inflated Type I error rate (false positives).
- These columns might be dropped because they are categorical variables or identifiers that do not directly relate to the continuous expression data or are not relevant for biological interpretation. Including only relevant numerical metadata ensures meaningful correlation analysis.
- FDR_BH stands for False Discovery Rate (FDR) control using the Benjamini-Hochberg (BH) procedure. It is a statistical method used to control the proportion of false positives among all significant results when conducting multiple hypothesis tests simultaneously.
- There are 18 modules in total. The number of counts (nCount_RNA) and the number of features (nFeature_RNA) appear to be significantly associated with Module 9. Although the rest of the modules do not have significant correlations, some of them still show relatively high positive correlations (for single-cell data) with individual mutations as well as with development stage. This can give an indication of which sets of genes may be impacted by those mutations or by disease.
Answer 8:
# Total number of missing cells in the expression matrix, then in the metadata
missing_in_expression = datExpr.isna().sum().sum()
print(missing_in_expression)
missing_in_metadata = metadata.isna().sum().sum()
print(missing_in_metadata)
0 0
In this case there are no missing values, but if there were then it's important to calculate first how many missing values there are e.g. per gene within the expression matrix or per donor within the metadata. If there was a high proportion of missing values, then those respective rows would have to be dropped. Alternatively, missing values can also be imputed using mean/median imputation or more sophisticated techniques like K-Nearest Neighbors imputation.
Answer 9:
# Empty modules-by-metadata table for the raw (unadjusted) p-values
module_p_values = pd.DataFrame(index=module_eigengenes.columns, columns=metadata2.columns)

# Test every module eigengene against every metadata variable
for metadata_column in metadata2.columns:
    trait_values = metadata2[metadata_column]
    for module in module_eigengenes.columns:
        # Second element of the pearsonr result is the p-value
        _, p_value = pearsonr(module_eigengenes[module], trait_values)
        module_p_values.at[module, metadata_column] = p_value

print("P-values before FDR adjustment:")
print(module_p_values)
P-values before FDR adjustment:
nCount_RNA nFeature_RNA percent.mt development_stage male \
Module_1 0.793491 0.72099 0.806646 0.440304 0.450857
Module_2 0.826566 0.85459 0.046988 0.354618 0.203374
Module_3 0.341225 0.244609 0.023954 0.952607 0.203374
Module_4 0.385701 0.310888 0.289457 0.833917 0.450857
Module_5 0.469428 0.614305 0.650711 0.868287 0.450857
Module_6 0.973395 0.935497 0.059141 0.010078 0.203374
Module_7 0.913504 0.891046 0.401758 0.663019 0.450857
Module_8 0.988275 0.751953 0.998581 0.331617 0.203374
Module_9 0.000005 0.000039 0.827907 0.459441 0.450857
Module_10 0.772535 0.851099 0.39401 0.107796 0.203374
Module_11 0.311724 0.303027 0.415034 0.809543 0.450857
Module_12 0.735911 0.869983 0.135952 0.354618 0.450857
Module_13 0.751662 0.711648 0.609287 0.162049 0.203374
Module_14 0.437182 0.370342 0.3253 0.118677 0.450857
Module_15 0.437182 0.370342 0.3253 0.118677 0.450857
Module_16 0.0761 0.076359 0.55857 0.987522 0.450857
Module_17 0.671581 0.753986 0.321611 0.587131 0.203374
Module_18 0.470112 0.425779 0.15972 0.952607 0.450857
female CH normal DNMT3A TET2 NoMutation
Module_1 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_2 0.203374 0.53308 0.53308 0.450857 0.16186 0.53308
Module_3 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_4 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_5 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_6 0.203374 0.53308 0.53308 0.203374 0.49175 0.53308
Module_7 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_8 0.203374 0.53308 0.53308 0.450857 0.16186 0.53308
Module_9 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_10 0.203374 0.53308 0.53308 0.203374 0.49175 0.53308
Module_11 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_12 0.450857 0.121306 0.121306 0.450857 0.49175 0.121306
Module_13 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_14 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_15 0.450857 0.53308 0.53308 0.203374 0.49175 0.53308
Module_16 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
Module_17 0.203374 0.121306 0.121306 0.450857 0.49175 0.121306
Module_18 0.450857 0.53308 0.53308 0.450857 0.16186 0.53308
# Apply the Benjamini-Hochberg FDR correction to each metadata column in turn
# and assemble the corrected columns into one table
adjusted_p_values = pd.DataFrame(
    {
        column: multipletests(module_p_values[column].astype(float), method='fdr_bh')[1]
        for column in module_p_values.columns
    },
    index=module_p_values.index,
    columns=module_p_values.columns,
)

print("Adjusted p-values after FDR correction:")
print(adjusted_p_values)
Adjusted p-values after FDR correction:
nCount_RNA nFeature_RNA percent.mt development_stage male \
Module_1 0.988275 0.935497 0.876607 0.826994 0.450857
Module_2 0.988275 0.935497 0.354845 0.797891 0.450857
Module_3 0.940223 0.935497 0.354845 0.987522 0.450857
Module_4 0.940223 0.935497 0.622551 0.987522 0.450857
Module_5 0.940223 0.935497 0.780854 0.987522 0.450857
Module_6 0.988275 0.935497 0.354845 0.181400 0.450857
Module_7 0.988275 0.935497 0.622551 0.987522 0.450857
Module_8 0.988275 0.935497 0.998581 0.797891 0.450857
Module_9 0.000088 0.000710 0.876607 0.826994 0.450857
Module_10 0.988275 0.935497 0.622551 0.534046 0.450857
Module_11 0.940223 0.935497 0.622551 0.987522 0.450857
Module_12 0.988275 0.935497 0.574992 0.797891 0.450857
Module_13 0.988275 0.935497 0.780854 0.583378 0.450857
Module_14 0.940223 0.935497 0.622551 0.534046 0.450857
Module_15 0.940223 0.935497 0.622551 0.534046 0.450857
Module_16 0.684901 0.687232 0.773405 0.987522 0.450857
Module_17 0.988275 0.935497 0.622551 0.960760 0.450857
Module_18 0.940223 0.935497 0.574992 0.987522 0.450857
female CH normal DNMT3A TET2 NoMutation
Module_1 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_2 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_3 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_4 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_5 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_6 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_7 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_8 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_9 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_10 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_11 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_12 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_13 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_14 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_15 0.450857 0.533080 0.533080 0.450857 0.491750 0.533080
Module_16 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
Module_17 0.450857 0.363919 0.363919 0.450857 0.491750 0.363919
Module_18 0.450857 0.533080 0.533080 0.450857 0.485579 0.533080
# P-values strictly below this cut-off are counted as significant
significance_threshold = 0.05

# Raw p-values: mark the cells under the cut-off, then total them over the whole table
is_significant_raw = module_p_values.lt(significance_threshold)
significant_before = is_significant_raw.sum().sum()
print(f"Number of significant correlations before FDR adjustment: {significant_before}")

# Same count on the FDR-adjusted p-values
is_significant_adjusted = adjusted_p_values.lt(significance_threshold)
significant_after = is_significant_adjusted.sum().sum()
print(f"Number of significant correlations after FDR adjustment: {significant_after}")
Number of significant correlations before FDR adjustment: 5 Number of significant correlations after FDR adjustment: 2
Answer 10:
# Scatter the Module_1 eigengene against the donors' development stage
stage_values = metadata['development_stage']
eigengene_values = module_eigengenes['Module_1']
plt.scatter(stage_values, eigengene_values)
plt.title('Eigengene Expression vs Metadata Variable')
plt.xlabel('Selected Metadata Variable')
plt.ylabel('Module 1 Eigengene')
plt.show()
As can be seen from the scatter plot, development stage appears to have only a weak relationship with Module 1: the points are widely spread out and there is no apparent trend. There is also an outlier within the data. This may be due to the small number of samples as well as the sparse nature of single-cell data.