The Canine Data Commons supports the management, analysis and sharing of genomics data for the canine research community and aims to accelerate opportunities for discovery and development for the treatment and prevention of canine cancer.
# Uncomment the lines to install libraries if needed.
# !pip install --force --upgrade gen3 --ignore-installed certifi
# !pip install numpy
# !pip install matplotlib
# !pip install pandas
# !pip install seaborn
# Import libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
import seaborn as sns
import re
from pandas import DataFrame
import warnings
warnings.filterwarnings("ignore")
!pip install gen3
# Import Gen3 SDK tools
import gen3
from gen3.auth import Gen3Auth
from gen3.submission import Gen3Submission
from gen3.index import Gen3Index
# Directory used for the files this notebook exports
# (swap os.getcwd() for an explicit path if needed).
home_directory = os.getcwd()
# Base URL of the Gen3 commons whose API is queried.
endpoint = "https://caninedc.org/"
# Path to the credentials JSON; download the file from
# https://caninedc.org/identity and point this variable at it.
creds = "/user/path/canine_creds.json"
# Gen3Auth generates access tokens from the credentials file; the resulting
# auth object is handed to the submission client together with the endpoint.
auth = Gen3Auth(endpoint, creds)
sub = Gen3Submission(endpoint, auth)
# Structured data is addressed by program and project. Both can be looked up on
# commons.url/submission, which for this notebook is https://caninedc.org/submission.
# Here the program "Canine" and its project "NHGRI" are selected.
program, project = "Canine", "NHGRI"
# Structured data lives under nodes; the nodes of the NHGRI project are shown in the
# graph model on https://caninedc.org/Canine-NHGRI.
# This notebook looks at the nodes "subject" and "sample".
# Both are exported with the SDK function "export_node":
# Syntax: subject_data = sub.export_node(program, project, node_type, fileformat, filename)
subject_tsv = f"{home_directory}/subject.tsv"
sample_tsv = f"{home_directory}/sample.tsv"
subject_data = sub.export_node(program, project, "subject", "tsv", subject_tsv)
sample_data = sub.export_node(program, project, "sample", "tsv", sample_tsv)
# Load the exported subject TSV into a pandas dataframe using the tab delimiter '\t'.
# The path is built exactly as in the export step, so the file is still found when
# home_directory has been replaced with a directory other than the current one.
subject = pd.read_csv(home_directory + "/subject.tsv", sep='\t', header=0)
# "subject" is now a dataframe, so pandas methods can be called on it as subject.<method>()
# Return the first 5 rows of the dataframe "subject"
subject.head()
# Commands to show dataframe shape and info:
# 1. Print a summary of the dataframe using: subject.info()
# 2. Return the dimensions of the dataframe as (rows, columns) using: subject.shape
# Drop every column whose values are all NaN and keep the result as a new dataframe
subject_clean = subject.dropna(axis=1, how='all')
subject_clean.info()
# Return only one column from the dataframe. Here we show two options to do this.
# Option 1: select the column by its name
subject_clean['species']
# Option 2: use "iloc" with the column's integer position. The position is looked up
# from the column name rather than hard-coded, because the number of columns left after
# dropna depends on the data and a fixed index could silently select another column.
subject_clean.iloc[:, subject_clean.columns.get_loc('species')]
# Remove columns that are not needed for the data analysis with the function "drop"
subject_clean_slim = subject_clean.drop(columns=['type', 'id', 'project_id', 'studies.id', 'studies.submitter_id'])
subject_clean_slim.head()
# The occurrences of the different breeds can be counted in three ways.
# Option 1: apply "value_counts" to the breed column
subject_clean_slim_breeds = subject_clean_slim['breed'].value_counts()
print(subject_clean_slim_breeds)
# Option 2: group by breed, take the size of each group and let pandas order the
# counts from largest to smallest with "sort_values(ascending=False)"
breed_sizes = subject_clean_slim.groupby('breed').size().sort_values(ascending=False)
breed_sizes
# The sorted counts can be plotted directly as a bar chart via matplotlib.pyplot (imported as plt)
breed_sizes.plot(kind='bar')
plt.title('Breeds')
plt.ylabel('n')
plt.xlim(-1, 17.5)  # limit the x-axis to the first 18 entries (instead of the full 132)
plt.show()
# Option 3: count the entries per breed with "pivot_table" and store the result in a file
countsbreed = subject_clean_slim.pivot_table(index='breed', aggfunc='size')
print(countsbreed)
countsbreed.shape
# Write the counts to a csv file
countsbreed.to_csv('countsbreed.csv')
# Read the saved file back; header=0 together with "names" replaces the column names
# found in the file's first row with 'breed' and 'counts'
counts_breed = pd.read_csv("countsbreed.csv", header=0, names=['breed', 'counts'])
counts_breed.head(10)  # shows the first 10 rows
# Pie chart of the breeds, restricted to the 13 largest entries among those with counts > 9
has_more_than_9 = counts_breed['counts'] > 9
top13 = counts_breed.loc[has_more_than_9].nlargest(13, 'counts')
categories = top13["breed"]
data = top13['counts']
fig1, ax1 = plt.subplots()
ax1.pie(data, labels=categories, shadow=True, startangle=90, autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio, so the pie is drawn as a circle
plt.show()
# Goal: show every breed with more than 9 counts under its own name and pool all
# remaining breeds under the label "Other".
# np.where keeps the original "breed" where counts > 9 and writes "Other" everywhere
# else, so a breed with exactly 9 counts ends up in "Other".
counts_breed["new_breed"] = np.where(counts_breed["counts"] > 9, counts_breed['breed'], 'Other')
# Sum the counts per "new_breed" label. Only the "counts" column is aggregated: summing
# the whole frame would also aggregate the text column "breed", and how pandas treats
# such a column depends on the pandas version.
# reset_index turns the group labels back into a regular "new_breed" column.
count_table = counts_breed.groupby('new_breed')['counts'].sum().reset_index()
# Select the 10 largest categories with more than 9 counts for the chart.
# NOTE(review): the name "top14" and the limit 10 disagree, and categories beyond the
# 10 largest are left out of the pie instead of being added to "Other" - confirm the
# intended limit.
top14 = count_table[count_table.counts > 9].nlargest(10, 'counts')
data = top14["counts"]
categories = top14["new_breed"]
# Set the font size before the figure is created: the pie labels pick up the rcParams
# value when they are created, so changing it after drawing would not affect this figure.
mpl.rcParams['font.size'] = 11.5
fig1, ax1 = plt.subplots(figsize=(10, 10))  # figure size in inches
ax1.pie(data, labels=categories, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
# Save the figure before showing it, so the file is written while the figure is
# guaranteed to still be open.
fig1.savefig('plot.png')
plt.show()
End of notebook.