Allen Brain Atlas & KEGG: Part I

The code below
  • Downloads donor microarray gene data sets w.r.t. a disorder or disease of interest
  • Queries KEGG for associated pathways.

% Humans
url = 'http://api.brain-map.org/api/v2/data/query.json?criteria=model::Specimen,rma::criteria,specimen_types[id$eq1]';
str = urlread(url);
humanData = parse_json(str);


% Ontology
url = 'http://api.brain-map.org/api/v2/data/query.json?criteria=model::Ontology';
str = urlread(url);
ontologyData = parse_json(str);


% Genes
Disorder = input('Group of Genes w.r.t. which Allen Gene Classification (e.g. ''Epilepsy'') : '); 
url = ['http://api.brain-map.org/api/v2/data/query.json?criteria=model::Gene,rma::criteria,'...
        '[organism_id$eq1],products[id$eq2],gene_classifications[name$il''', Disorder ,'''],rma::include,probes[probe_type$eq''DNA'']'];
str = urlread(url);
genesParameters = parse_json(str);

clc



% KEGG
% Web Service Description Language
WSDL = 'http://soap.genome.jp/KEGG.wsdl';
className = createClassFromWsdl(WSDL);
kegg = KEGG;
%{
    dir @KEGG                   % Automatically Generated Files
    methods(KEGG)               % List of methods.
    classType = class(kegg);    % Confirm that kegg is an instance of "KEGG"
%}
organismsKEGG = list_organisms(kegg);
homoSapiensID = find( ~cellfun(@isempty, regexp({organismsKEGG.entry_id}', 'hsa') ));
homoSapiensPathways = list_pathways(kegg, organismsKEGG(homoSapiensID).entry_id);
homoSapiensPathwaysID = (1:size(homoSapiensPathways, 1))';

clc

% per Gene
geneData = cell(1, 4 + size(humanData.msg, 2));
filteredGeneData  = cell(1, 4 + size(humanData.msg, 2));
pathwaysData = [];
partsData = cell(1);
for j = 1:1:size(genesParameters.msg, 2)   
   
    % Gene
    geneID = genesParameters.msg{1,j}.id;
    keggID = bconv(kegg, ['ncbi-geneid:', num2str(genesParameters.msg{1,j}.entrez_id)]);
    [~, startAt] = regexp(keggID, 'hsa:');
    endAt = strfind(keggID, 'equivalent');
    keggID = strtrim(keggID(startAt + 1:endAt - 1));   
    pathwaysID = get_pathways_by_genes(kegg, {['hsa:', keggID]});
    pathwaysID = regexprep(pathwaysID, 'path:hsa', '');
   

    % Probes
    probesSet = cell(1);
    probesNames = cell(1);
    for k = 1:size(genesParameters.msg{1,j}.probes, 2)
       
        probesSet{k,1} = genesParameters.msg{1,j}.probes{1,k}.id;
        probesNames{k,1} = genesParameters.msg{1,j}.probes{1,k}.name;
       
        Index = strfind(probesNames{k,1}, '_');
        probesNames{k,1}(Index) = ' ';
       
    end
    probesSet = cell2mat(probesSet);
   
   
    % Download Expressions
    [samples, explevels] = download_expression(probesSet);
   
   
    % Structures  
    topStructureID = cell(1);
    structureID = cell(1);
    donorID = cell(1);
    for n = 1:1:size(samples,2)
       
        topStructureID{n,1} = samples{1,n}.top_level_structure.id;       
        structureID{n,1} = samples{1,n}.structure.id;       
        donorID{n,1} = samples{1,n}.donor.id;
       
    end
    topStructureID = cell2mat(topStructureID);
    structureID = cell2mat(structureID);
    donorID = cell2mat(donorID);
    partsID = [topStructureID structureID donorID];   
   
   
      
    % Probes Selection
    if size(probesSet, 1) > 1
       
        compareProbes = cell(1, size(humanData.msg, 2));
        for r = 1:1:size(humanData.msg, 2)
           
            Indices = (donorID == humanData.msg{1,r}.donor_id);
            Parts = partsID(Indices,:);
            Expressions = explevels(Indices,:);
            N = (1:sum(Indices))';
           
           
            % Correlations
            Correlations = corr(Expressions, 'type', 'Spearman');
           
           
            % The Most Correlative Pair
            Correlations = Correlations - eye(size(Correlations));
            [Maximums, cIndices] = max(Correlations, [], 2);
            [~, rIndices] = max(Maximums);
            iCompare = [rIndices cIndices(rIndices)];
           
            % Check
            R = Correlations(rIndices,cIndices(rIndices));
           
           
            compareProbes{1,r} = iCompare;
        end       
       
    end


    % Geometric Means
    Collate = cell(1, size(humanData.msg, 2));
    filteredCollate = cell(1, size(humanData.msg, 2));
    if size(probesSet, 1) > 1
       
        for r = 1:1:size(humanData.msg, 2)           
            Indices = (donorID == humanData.msg{1,r}.donor_id);
            N = (1:sum(Indices))';
            Parts = partsID(Indices,:);
            Expressions = explevels(Indices,:);
            Expressions = Expressions(:, compareProbes{1,r});          
            Expressions(Expressions < 0 | isnan(Expressions)) = 0;           
            uniqueParts = unique(Parts, 'rows');           
            valuesSeries = geomean(Expressions, 2);
            correlationsSeries = zeros(sum(Indices), 1);
            for n = 1:1:size(uniqueParts, 1)
               
                Index = ismember(Parts, uniqueParts(n,:), 'rows');
                iIndex = N(Index);               
                Set = Expressions(Index, :);
                iSet = ~any(Set == 0, 2);               
                if sum(iSet) == 1
                    correlationsSeries(iIndex(iSet)) = 1 - erf(std(Set(iSet,:)));
                    continue;
                elseif sum(iSet) == 0
                    continue;
                end
               
                Correlation = corr(Set(iSet,:), 'type', 'Spearman');
                correlationsSeries(iIndex(iSet)) = Correlation(1,2);               
            end
           
            Collate{1,r} = [geneID*ones(size(valuesSeries)) valuesSeries correlationsSeries Parts];           
        end
       
    else       
        for r = 1:1:size(humanData.msg, 2)           
            Indices = (donorID == humanData.msg{1,r}.donor_id);
            N = (1:sum(Indices))';
            Parts = partsID(Indices,:);
            Expressions = explevels(Indices,:);
            Expressions(Expressions < 0 | isnan(Expressions)) = 0;     
           
            valuesSeries = Expressions;
            correlationsSeries = ones(sum(Indices), 1);
           
            Collate{1,r} = [geneID*ones(size(valuesSeries)) valuesSeries correlationsSeries Parts];           
        end       
    end
   
    geneData(j,:) = [geneID genesParameters.msg{1,j}.entrez_id keggID genesParameters.msg{1,j}.acronym Collate];
   
    for r = 1:1:size(Collate, 2)
        filteredCollate{1,r} = Collate{1,r}(Collate{1,r}(:,3) >= 0.795 & Collate{1,r}(:,2) > 0, :);       
    end
   
    filteredGeneData(j,:) = [geneID genesParameters.msg{1,j}.entrez_id keggID genesParameters.msg{1,j}.acronym filteredCollate];
   
    if ~isempty(pathwaysID)
        pathwaysData = [pathwaysData; [num2cell( geneID*ones( size(pathwaysID, 1), 1) ) pathwaysID]]; %#ok<*AGROW>
    end
   
    partsData{j,1} = partsID;
    
end
partsData = cell2mat(partsData);

fileName = datestr(now, 'ddmmmyyyyHHMMSS');
save(fileName, 'geneData', 'filteredGeneData', 'pathwaysData', 'partsData', 'homoSapiensID', 'homoSapiensPathways', 'homoSapiensPathwaysID')