BALL  1.4.79
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
binaryFingerprintMethods.h
Go to the documentation of this file.
1 // -*- Mode: C++; tab-width: 2; -*-
2 // vi: set ts=2:
3 //
4 
5 #ifndef BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
6 #define BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
7 
8 #ifndef BALL_COMMON_H
9 # include <BALL/common.h>
10 #endif
11 
12 #ifndef BALL_DATATYPE_OPTIONS_H
13 # include <BALL/DATATYPE/options.h>
14 #endif
15 
16 
17 #include <boost/graph/adjacency_list.hpp>
18 #include <boost/graph/graph_traits.hpp>
19 #include <boost/graph/incremental_components.hpp>
20 #include <boost/thread/mutex.hpp>
21 #include <boost/thread/thread.hpp>
22 #include <boost/unordered_set.hpp>
23 
24 
25 #include <map>
26 #include <set>
27 #include <vector>
28 
29 
30 namespace BALL
31 {
67  {
68  struct InvertedIndex;
69 
74 
75  typedef std::vector<InvertedIndex*> InvertedIndices;
76  typedef std::vector<std::vector<unsigned short> > FingerprintFeatures;
77 
78  typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS> SimilarityGraph;
79  typedef boost::graph_traits<SimilarityGraph>::vertex_descriptor Vertex;
80  typedef boost::graph_traits<SimilarityGraph>::vertices_size_type VertexIndex;
81  typedef boost::disjoint_sets<Vertex*, VertexIndex*, boost::find_with_full_path_compression> DisjointSet;
82 
84 
85  public:
90 
95  {
99  static const String BLOCKSIZE;
100 
104  static const String SIM_CUTOFF;
105 
109  static const String PRECISION;
110 
114  static const String STORE_NN;
115 
120  static const String MAX_CLUSTERS;
121 
125  static const String N_THREADS;
126 
130  static const String VERBOSITY;
131  };
132 
137  {
138  static const unsigned short BLOCKSIZE;
139  static const float SIM_CUTOFF;
140  static const float PRECISION;
141  static const unsigned int MAX_CLUSTERS;
142  static const bool STORE_NN;
143  static const unsigned int N_THREADS;
144  static const int VERBOSITY;
145  };
146 
147 
152 
157 
158 
163  BinaryFingerprintMethods(const Options& options);
164 
165 
171  BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features);
172 
173 
180  BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features, const FingerprintFeatures& query_features);
181 
182 
187 
188 
192  virtual ~BinaryFingerprintMethods();
193 
195 
200 
204  BinaryFingerprintMethods& operator = (const BinaryFingerprintMethods& bfm);
205 
207 
208 
218  static bool parseBinaryFingerprint(const String& fprint, std::vector<unsigned short>& features, unsigned int fp_type, const char* delim=",");
219 
220 
225 
231  bool setLibraryFeatures(const FingerprintFeatures& lib_features);
232 
233 
239  bool setQueryFeatures(const FingerprintFeatures& query_features);
240 
241 
246  unsigned int getTargetLibrarySize() const;
247 
248 
253  unsigned int getQueryLibrarySize() const;
254 
255 
260  const Options& getOptions() const;
261 
262 
267  void setVerbosityLevel(const int verbosity);
268 
270 
271 
276 
282  bool cutoffSearch(const float sim_cutoff, const String& outfile_name);
283 
284 
295  bool connectedComponents(const std::vector<unsigned int>& selection,
296  std::vector<std::vector<unsigned int> >& ccs,
297  std::vector<std::vector<std::pair<unsigned int, float> > >& nn_data,
298  const float cutoff,
299  const bool store_nns = false);
300 
301 
309  bool averageLinkageClustering(const std::vector<unsigned int>& selection,
310  std::vector<std::pair<unsigned int, float> >& nn_data,
311  std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
312 
313 
321  bool calculateSelectionMedoid(const std::vector<unsigned int>& selection, unsigned int& medoid_index, std::vector<float>& avg_sims);
322 
324 
325  private:
329  struct ThreadData
330  {
331  LongSize first;
332  LongSize last;
333 
334  float nn_sim;
335  unsigned int blocksize;
336  unsigned int dataset_size;
337  unsigned int active_iids_size;
338 
339  unsigned short* cc_row;
340  unsigned short** cc_matrix;
341 
342  float* float_array;
343  double** dprec_matrix;
344  unsigned int* uint_array;
345 
346  String outfile_name;
347  };
348 
349 
355  struct FeatureList
356  {
357  // FeatureID which corresponds in principle to the index of a bit in a 2D fingerprint.
358  unsigned short feature_id;
359 
360  // Array which stores the sorted list of all molecules which all share the associated feature_id.
361  // Molecules are only represented by their positions in the Block they belong to.
362  unsigned short* block_positions;
363  };
364 
365 
369  struct InvertedIndex
370  {
371  // Number of molecules in the block.
372  unsigned int n_molecules;
373 
374  // Array which stores the number of features of every molecule in the InvertedIndex
375  unsigned short* n_features;
376 
377  // Array which stores the ID of the parent cluster to which the molecule at the InvertedIndex position belongs to.
378  unsigned int* parent_clusters;
379 
380  // Array of FeatureLists implemented as a skip list.
381  FeatureList* feature_skip_list;
382  };
383 
384 
388  struct Cluster
389  {
390  // Molecule ID for leaf nodes. Otherwise internal Cluster ID.
391  unsigned int c_id;
392 
393  // Cluster index [0, n_molecules-1] is used to map from 2D matrix to 1D array.
394  unsigned int c_index;
395 
396  // Number of leaf nodes inside in the corresponding cluster.
397  unsigned int c_size;
398 
399  // Left child. NULL if cluster is a leaf.
400  Cluster* l_child;
401 
402  // Right child. NULL if cluster is a leaf.
403  Cluster* r_child;
404 
405  // Pointer to the clusters current nearest neigbour.
406  Cluster* nn;
407 
408  // Flag for various purposes.
409  bool is_rnn;
410 
411  // Sum of all intra-cluster pairwise similarities within this cluster. Used finally for KGS penalty calculation.
412  double sim_sum;
413 
414  // Pointer to predecessor cluster in the Nearest Neighbour Chain. NULL if not member of the Nearest Neighbour Chain.
415  Cluster* predecessor;
416 
417  // Similarity to predecessor in Nearest Neighbour Chain. 0.0 if not member of Nearest Neighbour Chain.
418  float predecessor_sim;
419 
420  // Sum of all inter-cluster pairwise similarities of clusters nn_chain[i] and nn_chain[i-1]. 0.0 if not member of the NNChain.
421  double predecessor_sim_sum;
422 
423  // Inverted indices of all leaf molecules within this cluster.
424  boost::unordered_set<InvertedIndex*> c_members;
425 
426  // Vector of pointers to FingerprintFeatures in lib_features_ of all leaf molecules within this cluster.
427  std::vector<const std::vector<unsigned short>* > leaf_members;
428  };
429 
430 
431  enum clustering_methods_
432  {
433  STORED_DATA_PARALLEL,
434  STORED_MATRIX
435  };
436 
437 
441  DisjointSet* ds;
442 
443 
447  std::vector<Vertex> parent;
448 
449 
453  std::vector<VertexIndex> rank;
454 
455 
459  Options options_;
460 
461 
465  unsigned int n_threads_;
466 
467 
472  const FingerprintFeatures* lib_features_;
473 
474 
478  InvertedIndices lib_iindices_;
479 
480 
485  const FingerprintFeatures* query_features_;
486 
487 
491  InvertedIndices query_iindices_;
492 
493 
497  boost::thread* threads_;
498 
499 
503  boost::mutex out_mutex_;
504 
505 
509  ThreadData* thread_data_;
510 
511 
515  unsigned short blocksize_;
516 
517 
521  LongSize cc_matrix_size_;
522 
523 
527  float cutoff_;
528 
529 
533  float precision_;
534 
535 
539  bool store_nns_;
540 
541 
547  LongSize n_comparisons_;
548 
549 
553  int verbosity_;
554 
555 
559  std::vector<Cluster*> leaf_clusters_;
560 
561 
565  std::map<float, std::vector<Cluster*> > internal_clusters_;
566 
567 
571  std::vector<Cluster*> vec_actives_;
572 
576  std::vector<Cluster*> vec_inactives_;
577 
578 
582  float* sim_matrix_;
583 
587  double* dprec_sim_matrix_;
588 
589 
593  std::vector<std::pair<std::vector<InvertedIndex*>, unsigned int> > active_iids_;
594 
595 
600  unsigned int active_iids_size_;
601 
602 
606  Cluster* nn_chain_tip_;
607 
608 
612  unsigned int nn_chain_size_;
613 
614 
618  std::vector<Cluster*>::iterator current_nn_;
619 
620 
624  float current_nn_sim_;
625 
626 
630  unsigned short clustering_method_;
631 
632 
636  unsigned int n_clusters_;
637 
638 
643  unsigned int max_clusters_;
644 
645 
650  void setup(const Options& options);
651 
652 
657  void clear();
658 
659 
664  void assign(const BinaryFingerprintMethods& bfm);
665 
666 
672  bool checkInputData(const std::vector<unsigned int>& selection) const;
673 
674 
681  void createThreadData(const unsigned int blocksize, const unsigned int dataset_size, const unsigned int active_iids_size);
682 
683 
687  void destroyThreadData();
688 
689 
695  InvertedIndex* createInvertedIndex(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& members);
696 
697 
702  void destroyInvertedIndex(InvertedIndex* ii);
703 
704 
709  void destroyInvertedIndices(InvertedIndices& ii_destroy);
710 
711 
717  void createInvertedIndices(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& molecules, InvertedIndices& ii_target);
718 
719 
724  void setBlockSize(const unsigned short blocksize);
725 
726 
733  void arrayToUpperTriangluarMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
734 
735 
742  void arrayToStrictUpperTriangularMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
743 
744 
751  LongSize strictUpperTriangularMatrixToArray(const LongSize row, const LongSize col) const;
752 
753 
759  bool getNextComparisonIndex(LongSize& index);
760 
761 
770  bool checkSimilaritySwitch(const float a_sim, const float b_sim, const unsigned int a_id, const unsigned int b_id) const;
771 
772 
779  void calculateCommonCounts_1_1(const FeatureList* ii1, const FeatureList* ii2, unsigned short& cc_count);
780 
781 
788  void calculateCommonCounts_1_N(const FeatureList* ii1, const FeatureList* ii2, unsigned short* cc_row);
789 
790 
797  void calculateCommonCounts_M_N(const InvertedIndex* ii_1, const InvertedIndex* ii_2, unsigned short** cc_matrix);
798 
799 
807  void cutoffSearchSimilarities(const unsigned int query_index, const unsigned int lib_index, unsigned short** cc_matrix, File& outfile);
808 
809 
814  void cutoffSearchThread(const unsigned int thread_id);
815 
816 
823  typedef void (BinaryFingerprintMethods::*PairwiseSimilaritiesBase)(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
824  PairwiseSimilaritiesBase pairwiseSimilaritiesBase;
825 
826 
833  void pairwiseSimilaritiesNearestNeighbours(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
834 
835 
842  void pairwiseSimilaritiesStoredMatrix(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
843 
844 
852  void pairwiseSimilaritiesConnectedComponents(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
853 
854 
861  void pairwiseSimilaritiesMedoids(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
862 
863 
870  bool pairwiseSimilarities(const std::vector<unsigned int>& selection, std::vector<std::pair<unsigned int, float> >& nn_data);
871 
872 
877  void pairwiseSimilaritiesThread(const unsigned int thread_id);
878 
879 
887  void calculateParallelSimilaritiesActives(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
888 
889 
894  void calculateParallelSimilaritiesActivesThread(const unsigned int thread_id);
895 
896 
904  void calculateParallelSimilarities(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
905 
906 
911  void calculateParallelSimilaritiesThread(const unsigned int thread_id);
912 
913 
920  void similarityUpdateAverageLinkage(const Cluster* merged_cluster, const Cluster* current);
921 
927  void similarityUpdateAverageLinkageThread(const unsigned int thread_id, const Cluster* merged_cluster);
928 
929 
937  void clusterSimilaritySum_1_N(const InvertedIndex* ii1, const InvertedIndex* ii2, const unsigned short* cc_row, double& sim_sum);
938 
939 
947  void clusterSimilaritySum_M_N(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double& sim_sum);
948 
949 
954  void similarityMatrixFromClustersThread(const unsigned int thread_id);
955 
956 
962  void averageLinkageParallel(Cluster*& root);
963 
964 
970  void NNChainCore(Cluster*& root);
971 
972 
978  void clusterSelectionKGS(std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
979 
980 
986  void enumerateClusterMembers(Cluster* cl, unsigned int cluster_id);
987 
988 
992  void initNNChain();
993 
994 
998  void nextNearestNeighbour();
999 
1000 
1004  void moveNearestNeighbour();
1005 
1006 
1011  void finalizeNNChain(Cluster*& root);
1012 
1013 
1021  Cluster* mergeClusters(Cluster* c1, Cluster* c2, double sim_sum);
1022 
1023 
1028  Cluster* createCluster();
1029 
1033  void switchStorageMethod();
1034 
1035 
1039  void finalizeClustering();
1040  };
1041 } // namespace BALL
1042 
1043 #endif // BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
BALL_EXPORT MoleculeList molecules(const AtomContainer &fragment, bool selected_only=false)
BALL_ULONG64_TYPE LongSize
#define BALL_EXPORT
Definition: COMMON/global.h:50
This class provides efficient similarity calculation functionality for 2D binary fingerprints.