RefineClusterer Class Reference
Collaboration diagram for RefineClusterer:

Classes

struct  point_info
 

Public Types

typedef int32 LocalInt
 
typedef uint_smaller ClustIndexInt
 

Public Member Functions

 RefineClusterer (const std::vector< Clusterable *> &points, std::vector< Clusterable *> *clusters, std::vector< int32 > *assignments, RefineClustersOptions cfg)
 
BaseFloat Refine ()
 

Private Member Functions

void InitPoint (int32 point)
 
void InitPoints ()
 
void Iterate ()
 
void MovePoint (int32 point, int32 new_index)
 
void UpdateClust (int32 clust)
 
void ProcessPoint (int32 point)
 
void UpdateInfo (int32 point, int32 idx)
 
point_info & GetInfo (int32 point, int32 idx)
 

Private Attributes

const std::vector< Clusterable * > & points_
 
std::vector< Clusterable * > * clusters_
 
std::vector< int32 > * assignments_
 
std::vector< point_info > info_
 
std::vector< ClustIndexInt > my_clust_index_
 
std::vector< LocalInt > clust_time_
 
std::vector< BaseFloat > clust_objf_
 
BaseFloat ans_
 
int32 num_clust_
 
int32 num_points_
 
int32 t_
 
RefineClustersOptions cfg_
 

Detailed Description

Definition at line 686 of file cluster-utils.cc.

Member Typedef Documentation

◆ ClustIndexInt

typedef uint_smaller ClustIndexInt

Definition at line 693 of file cluster-utils.cc.

◆ LocalInt

typedef int32 LocalInt

Definition at line 692 of file cluster-utils.cc.

Constructor & Destructor Documentation

◆ RefineClusterer()

RefineClusterer ( const std::vector< Clusterable *> &  points,
std::vector< Clusterable *> *  clusters,
std::vector< int32 > *  assignments,
RefineClustersOptions  cfg 
)
inline

Definition at line 695 of file cluster-utils.cc.

References BottomUpClusterer::ans_, BottomUpClusterer::clusters_, rnnlm::i, KALDI_ASSERT, and BottomUpClusterer::points_.

699  : points_(points), clusters_(clusters), assignments_(assignments),
700  cfg_(cfg) {
701  KALDI_ASSERT(cfg_.top_n >= 2);
702  num_points_ = points_.size();
703  num_clust_ = static_cast<int32> (clusters->size());
704 
705  // so can fit clust-id in LocalInt
706  if (cfg_.top_n > (int32) num_clust_) cfg_.top_n
707  = static_cast<int32> (num_clust_);
708  KALDI_ASSERT(cfg_.top_n == static_cast<int32>(static_cast<ClustIndexInt>(cfg_.top_n)));
709  t_ = 0;
710  my_clust_index_.resize(num_points_);
711  // will set all PointInfo's to 0 too (they will be up-to-date).
712  clust_time_.resize(num_clust_, 0);
713  clust_objf_.resize(num_clust_);
714  for (int32 i = 0; i < num_clust_; i++)
715  clust_objf_[i] = (*clusters_)[i]->Objf();
716  info_.resize(num_points_ * cfg_.top_n);
717  ans_ = 0;
718  InitPoints();
719  }
std::vector< ClustIndexInt > my_clust_index_
std::vector< point_info > info_
std::vector< BaseFloat > clust_objf_
RefineClustersOptions cfg_
kaldi::int32 int32
std::vector< LocalInt > clust_time_
std::vector< int32 > * assignments_
const std::vector< Clusterable * > & points_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
std::vector< Clusterable * > * clusters_

Member Function Documentation

◆ GetInfo()

point_info& GetInfo ( int32  point,
int32  idx 
)
inline private

Definition at line 868 of file cluster-utils.cc.

References rnnlm::i, KALDI_ASSERT, and KALDI_PARANOID_ASSERT.

868  {
869  KALDI_ASSERT(point < num_points_ && idx < cfg_.top_n);
870  int32 i = point*cfg_.top_n + idx;
871  KALDI_PARANOID_ASSERT(i < static_cast<int32>(info_.size()));
872  return info_[i];
873  }
std::vector< point_info > info_
RefineClustersOptions cfg_
kaldi::int32 int32
#define KALDI_PARANOID_ASSERT(cond)
Definition: kaldi-error.h:206
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ InitPoint()

void InitPoint ( int32  point)
inline private

Definition at line 728 of file cluster-utils.cc.

References Clusterable::Add(), RefineClusterer::point_info::clust, Clusterable::Copy(), RefineClusterer::point_info::objf, BottomUpClusterer::points_, and RefineClusterer::point_info::time.

728  {
729  // Find closest clusters to this point.
730  // distances are really negated objf changes, ignoring terms that don't vary with the "other" cluster.
731 
732  std::vector<std::pair<BaseFloat, LocalInt> > distances;
733  distances.reserve(num_clust_-1);
734  int32 my_clust = (*assignments_)[point];
735  Clusterable *point_cl = points_[point];
736 
737  for (int32 clust = 0;clust < num_clust_;clust++) {
738  if (clust != my_clust) {
739  Clusterable *tmp = (*clusters_)[clust]->Copy();
740  tmp->Add(*point_cl);
741  BaseFloat other_clust_objf = clust_objf_[clust];
742  BaseFloat other_clust_plus_me_objf = (*clusters_)[clust]->ObjfPlus(* (points_[point]));
743 
744  BaseFloat distance = other_clust_objf-other_clust_plus_me_objf; // negated delta-objf, with only "varying" terms.
745  distances.push_back(std::make_pair(distance, (LocalInt)clust));
746  delete tmp;
747  }
748  }
749  if ((cfg_.top_n-1-1) >= 0) {
750  std::nth_element(distances.begin(), distances.begin()+(cfg_.top_n-1-1), distances.end());
751  }
752  // top_n-1 is the # of elements we want to retain. -1 because we need the iterator
753  // that points to the end of that range (i.e. not potentially off the end of the array).
754 
755  for (int32 index = 0;index < cfg_.top_n-1;index++) {
756  point_info &info = GetInfo(point, index);
757  int32 clust = distances[index].second;
758  info.clust = clust;
759  BaseFloat distance = distances[index].first;
760  BaseFloat other_clust_objf = clust_objf_[clust];
761  BaseFloat other_clust_plus_me_objf = -(distance - other_clust_objf);
762  info.objf = other_clust_plus_me_objf;
763  info.time = 0;
764  }
765  // now put the last element in, which is my current cluster.
766  point_info &info = GetInfo(point, cfg_.top_n-1);
767  info.clust = my_clust;
768  info.time = 0;
769  info.objf = (*clusters_)[my_clust]->ObjfMinus(*(points_[point]));
770  my_clust_index_[point] = cfg_.top_n-1;
771  }
std::vector< ClustIndexInt > my_clust_index_
std::vector< BaseFloat > clust_objf_
RefineClustersOptions cfg_
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
const std::vector< Clusterable * > & points_
point_info & GetInfo(int32 point, int32 idx)

◆ InitPoints()

void InitPoints ( )
inline private

Definition at line 772 of file cluster-utils.cc.

772  {
773  // finds, for each point, the closest cfg_.top_n clusters (including its own cluster).
774  // this may be the most time-consuming step of the algorithm.
775  for (int32 p = 0;p < num_points_;p++) InitPoint(p);
776  }
kaldi::int32 int32
void InitPoint(int32 point)

◆ Iterate()

void Iterate ( )
inline private

Definition at line 777 of file cluster-utils.cc.

References KALDI_WARN.

777  {
778  int32 iter, num_iters = cfg_.num_iters;
779  for (iter = 0;iter < num_iters;iter++) {
780  int32 cur_t = t_;
781  for (int32 point = 0;point < num_points_;point++) {
782  if (t_+1 == 0) {
783  KALDI_WARN << "Stopping iterating at int32 moves";
784  return; // once we use up all time points, must return-- this
785  // should rarely happen as int32 is large.
786  }
787  ProcessPoint(point);
788  }
789  if (t_ == cur_t) break; // nothing changed so we converged.
790  }
791  }
void ProcessPoint(int32 point)
RefineClustersOptions cfg_
kaldi::int32 int32
#define KALDI_WARN
Definition: kaldi-error.h:150

◆ MovePoint()

void MovePoint ( int32  point,
int32  new_index 
)
inline private

Definition at line 792 of file cluster-utils.cc.

References BottomUpClusterer::assignments_, RefineClusterer::point_info::clust, KALDI_ASSERT, and BottomUpClusterer::points_.

792  {
793  // move point to a different cluster.
794  t_++;
795  int32 old_index = my_clust_index_[point]; // index into info
796  // array corresponding to current cluster.
797  KALDI_ASSERT(new_index < cfg_.top_n && new_index != old_index);
798  point_info &old_info = GetInfo(point, old_index),
799  &new_info = GetInfo(point, new_index);
800  my_clust_index_[point] = new_index; // update to new index.
801 
802  int32 old_clust = old_info.clust, new_clust = new_info.clust;
803  KALDI_ASSERT( (*assignments_)[point] == old_clust);
804  (*assignments_)[point] = new_clust;
805  (*clusters_)[old_clust]->Sub( *(points_[point]) );
806  (*clusters_)[new_clust]->Add( *(points_[point]) );
807  UpdateClust(old_clust);
808  UpdateClust(new_clust);
809  }
std::vector< ClustIndexInt > my_clust_index_
RefineClustersOptions cfg_
kaldi::int32 int32
std::vector< int32 > * assignments_
void UpdateClust(int32 clust)
const std::vector< Clusterable * > & points_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
point_info & GetInfo(int32 point, int32 idx)

◆ ProcessPoint()

void ProcessPoint ( int32  point)
inline private

Definition at line 815 of file cluster-utils.cc.

References BottomUpClusterer::ans_, RefineClusterer::point_info::clust, KALDI_ASSERT, and RefineClusterer::point_info::objf.

815  {
816  // note: calling code uses the fact
817  // that it only ever increases t_ by one.
818  KALDI_ASSERT(point < num_points_);
819  // (1) Make sure own-cluster like is updated.
820  int32 self_index = my_clust_index_[point]; // index <cfg_.top_n of own cluster.
821  point_info &self_info = GetInfo(point, self_index);
822  int32 self_clust = self_info.clust; // cluster index of own cluster.
823  KALDI_ASSERT(self_index < cfg_.top_n);
824  UpdateInfo(point, self_index);
825 
826  float own_clust_objf = clust_objf_[self_clust];
827  float own_clust_minus_me_objf = self_info.objf; // objf of own cluster minus self.
828  // Now check the other "close" clusters and see if we want to move there.
829  for (int32 index = 0;index < cfg_.top_n;index++) {
830  if (index != self_index) {
831  UpdateInfo(point, index);
832  point_info &other_info = GetInfo(point, index);
833  BaseFloat other_clust_objf = clust_objf_[other_info.clust];
834  BaseFloat other_clust_plus_me_objf = other_info.objf;
835  BaseFloat impr = other_clust_plus_me_objf + own_clust_minus_me_objf
836  - other_clust_objf - own_clust_objf;
837  if (impr > 0) { // better to switch...
838  ans_ += impr;
839  MovePoint(point, index);
840  return; // the stuff we precomputed at the top is invalidated now, and it's
841  // easiest just to wait till next time we visit this point.
842  }
843  }
844  }
845  }
std::vector< ClustIndexInt > my_clust_index_
std::vector< BaseFloat > clust_objf_
RefineClustersOptions cfg_
kaldi::int32 int32
float BaseFloat
Definition: kaldi-types.h:29
void MovePoint(int32 point, int32 new_index)
void UpdateInfo(int32 point, int32 idx)
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185
point_info & GetInfo(int32 point, int32 idx)

◆ Refine()

BaseFloat Refine ( )
inline

Definition at line 721 of file cluster-utils.cc.

References BottomUpClusterer::ans_.

Referenced by kaldi::RefineClusters().

721  {
722  if (cfg_.top_n <= 1) return 0.0; // nothing to do.
723  Iterate();
724  return ans_;
725  }
RefineClustersOptions cfg_

◆ UpdateClust()

void UpdateClust ( int32  clust)
inline private

Definition at line 810 of file cluster-utils.cc.

References KALDI_ASSERT.

810  {
811  KALDI_ASSERT(clust < num_clust_);
812  clust_objf_[clust] = (*clusters_)[clust]->Objf();
813  clust_time_[clust] = t_;
814  }
std::vector< BaseFloat > clust_objf_
std::vector< LocalInt > clust_time_
#define KALDI_ASSERT(cond)
Definition: kaldi-error.h:185

◆ UpdateInfo()

void UpdateInfo ( int32  point,
int32  idx 
)
inline private

Definition at line 847 of file cluster-utils.cc.

References Clusterable::Add(), RefineClusterer::point_info::clust, Clusterable::Copy(), Clusterable::Objf(), RefineClusterer::point_info::objf, BottomUpClusterer::points_, Clusterable::Sub(), and RefineClusterer::point_info::time.

847  {
848  point_info &pinfo = GetInfo(point, idx);
849  if (pinfo.time < clust_time_[pinfo.clust]) { // it's not up-to-date...
850  Clusterable *tmp_cl = (*clusters_)[pinfo.clust]->Copy();
851  if (idx == my_clust_index_[point]) {
852  tmp_cl->Sub( *(points_[point]) );
853  } else{
854  tmp_cl->Add( *(points_[point]) );
855  }
856  pinfo.time = t_;
857  pinfo.objf = tmp_cl->Objf();
858  delete tmp_cl;
859  }
860  }
std::vector< ClustIndexInt > my_clust_index_
std::vector< LocalInt > clust_time_
const std::vector< Clusterable * > & points_
point_info & GetInfo(int32 point, int32 idx)

Member Data Documentation

◆ ans_

BaseFloat ans_
private

Definition at line 886 of file cluster-utils.cc.

◆ assignments_

std::vector<int32>* assignments_
private

Definition at line 877 of file cluster-utils.cc.

◆ cfg_

RefineClustersOptions cfg_
private

Definition at line 891 of file cluster-utils.cc.

◆ clust_objf_

std::vector<BaseFloat> clust_objf_
private

Definition at line 884 of file cluster-utils.cc.

◆ clust_time_

std::vector<LocalInt> clust_time_
private

Definition at line 883 of file cluster-utils.cc.

◆ clusters_

std::vector<Clusterable*>* clusters_
private

Definition at line 876 of file cluster-utils.cc.

◆ info_

std::vector<point_info> info_
private

Definition at line 879 of file cluster-utils.cc.

◆ my_clust_index_

std::vector<ClustIndexInt> my_clust_index_
private

Definition at line 880 of file cluster-utils.cc.

◆ num_clust_

int32 num_clust_
private

Definition at line 888 of file cluster-utils.cc.

◆ num_points_

int32 num_points_
private

Definition at line 889 of file cluster-utils.cc.

◆ points_

const std::vector<Clusterable*>& points_
private

Definition at line 875 of file cluster-utils.cc.

◆ t_

int32 t_
private

Definition at line 890 of file cluster-utils.cc.


The documentation for this class was generated from the following file: