5  first 1000

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Gene -> family assignments from the first-1000 test run.
# The TSV has no header row, so the two column names are supplied here.
input_families = "../exp/20240506_test_with_first1000/results/TAIR10.first1000.mcl.tsv"
families_df = pd.read_table(
    input_families,
    header=None,
    names=["gene", "family"],
)
families_df.head()
gene family
0 AT1G14930 1
1 AT1G14940 1
2 AT1G14950 1
3 AT1G14960 1
4 AT1G23120 1
# Number of rows (genes) per family id; the result is a Series indexed by "family".
families_count = families_df.groupby(by="family", sort=True).size()
families_count
family
1      6
2      6
3      5
4      5
5      5
      ..
96     2
97     2
98     2
99     2
100    1
Length: 100, dtype: int64
# Histogram of family sizes for the first-1000 test run. Titled and labelled
# the same way as the plots of the larger run further down, so the figures
# can be read and compared without consulting the code.
sns.histplot(families_count)
plt.title("Gene families size distribution in TAIR10 (first 1000)")
plt.xlabel("Size")
plt.show()

5.1 Larger dataset

# Gene -> family assignments from the larger TAIR10 run (path below).
# The TSV has no header row, so the two column names are supplied here.
input_families = "../exp/mutual/results/20240518_TAIR10_diamond_mcl.mcl.tsv"
families_df = pd.read_table(
    input_families,
    header=None,
    names=["gene", "family"],
)
families_df.head()
gene family
0 AT3G22150 1
1 AT3G47840 1
2 AT4G32430 1
3 AT2G33680 1
4 AT1G11290 1
# Number of rows (genes) per family id; the result is a Series indexed by "family".
families_count = families_df.groupby(by="family", sort=True).size()
families_count
family
1       39
2       39
3       37
4       30
5       30
        ..
5339     1
5340     1
5341     1
5342     1
5343     1
Length: 5343, dtype: int64
# Distribution of gene-family sizes, linear y-axis.
sns.displot(families_count)
# Title and x-label are applied to the current axes (the ones just drawn on).
plt.gca().set(
    title="Gene families size distribution in TAIR10",
    xlabel="Size",
)
plt.show()

# Same distribution with a logarithmic y-axis, so the rarely occurring
# family sizes remain visible next to the dominant ones.
sns.displot(families_count)
# Scale, title and x-label are applied to the current axes in one call.
plt.gca().set(
    yscale="log",
    title="Gene families size distribution in TAIR10 (log scale)",
    xlabel="Size",
)
plt.show()