From 345086a053ea0921ea01225ef0e69c3956af99c0 Mon Sep 17 00:00:00 2001 From: Gary Oblock Date: Thu, 10 Sep 2020 12:23:29 -0700 Subject: Some preliminary code for performance qualification of instance interleaving. --- gcc/ipa-str-reorg-instance-interleave.c | 283 ++++++++++++++++++++++++++++++-- 1 file changed, 268 insertions(+), 15 deletions(-) (limited to 'gcc/ipa-str-reorg-instance-interleave.c') diff --git a/gcc/ipa-str-reorg-instance-interleave.c b/gcc/ipa-str-reorg-instance-interleave.c index 16633d4f87c..c417926af64 100644 --- a/gcc/ipa-str-reorg-instance-interleave.c +++ b/gcc/ipa-str-reorg-instance-interleave.c @@ -89,14 +89,11 @@ static void set_lhs_for ( gimple *, tree); 1111 bad failure: walk return bogus op */ - -#define TOGSSA1 1 -#define TOGSSA2 1 -#define TOGUPD1 1 -#define TOGUPD2 1 -// ignore TOGUPD3 -#define TOGUPD3 1 -#define TOGUPD4 1 +// These are dummy values that always result in the reorganization +#define SINGLE_POOL_RAW_SKIP_IT 0.0 +#define SINGLE_POOL_RAW_DO_IT_ALWAYS 0.0 +#define SINGLE_POOL_ABS_SKIP_IT 0.0 +#define SINGLE_POOL_ABS_DO_IT_ALWAYS 0.0 int str_reorg_instance_interleave_qual ( Info *info) @@ -1649,9 +1646,7 @@ str_reorg_instance_interleave_trans ( Info *info) // Should update_stmt be called here? // It does not seem either harm or help so I'll // leave it in. -#if TOGUPD3 update_stmt ( use_stmt); -#endif } // Modify the LHS too // TBD This code needs to be more general. 
@@ -1664,13 +1659,10 @@ str_reorg_instance_interleave_trans ( Info *info) set_lhs_for ( def, new_ssa_name); - #if TOGUPD4 update_stmt ( def); - #endif + // This is where we know that ssa_name needs to be replaced - #if TOGSSA2 release_ssa_name_fn ( func, ssa_name); - #endif } INDENT(-4); @@ -1885,20 +1877,252 @@ str_reorg_instance_interleave_type_part ( Info *info) create_new_types ( info); } +// Types for performance qualification + +typedef struct reorg_bb_info reorg_bb_info_t; + struct reorg_bb_info { + basic_block *bb; + }; + + typedef struct perf_bb_info perf_bb_info_t; + typedef struct acc_info acc_info_t; + typedef struct var_info var_info_t; + + struct var_info { + varpool_node *var; + sbitmap *bits; + double count; + }; + + struct acc_info { + varpool_node *v; + int field_num; + }; + + struct perf_bb_info { + std::vector *vari; + basic_block *gcc_bb; + }; + +static void account_for_use( tree, std::vector *); +static bool is_array_access( tree); + static unsigned int reorg_perf_qual ( Info *info) { + #if 1 // TBD use design in doc but mark ReorgTypes // (do_instance_interleave) that qualify instead of deleting them // unless both dead field elimination and field reorderig are not // viable (use do_dead_field_elim and do_field_reorder in // Reorg_type_t.) - + // For the mean time assume if a ReorgType made it here then it's qualified. for ( int i = 0; i < info->reorg_type->size (); i++ ) { (*(info->reorg_type))[i].do_instance_interleave = true; } + #else + // We are doing a quick and dirty version of performance + // qualification for testing purposes and possibly the + // initial version for the main branch. + auto reorg_types = info->reorg_type; + + // These are floating point numbers because of the fractional + // accesses associated with the probabilistic approach taken + // below. By taken below I refer to full algorithm and the + // quick and dirty initial version. 
+ + // reorg was not possible for these + double cache_accesses = 0.0; + + // reorg possible for these. This doesn't mean it is + // profitable or even legal for all the types lumped + // in together here. However, the sum of this and + // cache_accesses should account for all the array + // accesses in the program. + double cache_accesses_noreorg = 0.0; + + // Perf Analysis + struct cgraph_node *node; + FOR_EACH_FUNCTION_WITH_GIMPLE_BODY ( node) { + struct function *func = DECL_STRUCT_FUNCTION ( node->decl); + // Ugly GCC idiom with global pointer to current function. + push_cfun ( func); + // TBD + class loop *loop; + FOR_EACH_LOOP_FN ( func, loop, LI_ONLY_INNERMOST ) + { + size_t num_bbs = loop->num_nodes; + basic_block *bbs = get_loop_body ( loop); + + // This stuff probably doesn't matter + #if 0 + int *bbsort = XNEWVEC ( int, num_bbs); + reorg_bb_info_t *rbbs = + XNEWVEC ( reorg_bb_info_t, num_bbs); + topsort( bbs, bbsort, loop->num_nodes); + for( i = 0; i < num_bbs; i++ ) { + rbbs[i].bb = &bbs[bbsort[i]]; + } + map bbmap; + #endif + + // TBD Stuff here + + // Obtain loop count by looking at all the block counts. + unsigned max_count = 0; + for ( unsigned i = 0; i < loop->num_nodes; i++) + { + basic_block bb = bbs [i]; + max_count = MAX( max_count, bb->count.value ()); + } + DEBUG_L("max_count = %d, nb_iterations_estimate = %ld\n", + max_count, loop->nb_iterations_estimate); + + // Originally this was done per bb but now it has to be per + // loop. TBD But perf_bb is per loop so we need something similar + // per loop. 
+ + std::vector pv = bb->vari; + for( auto pvi = pv->begin (); pvi != pv->end (); pv = pvi++ ) { // 676 + tree base_type = base_type_of( pvi->var->decl); + ReorgType_t *ri = get_reorgtype_info( base_type, info); + // Reorg accounting + if( ri != NULL ) { + double reorg_nca = 0.0; + int nf = number_of_fields_of( base_type); + int fi; + for( fi = 0; fi < nf; fi++ ) { // 684 + if( bitmap_bit_p( fi, pv->bits) ) { + int fld_width = field_width( base_type, fi); + reorg_nca += pvi->count * alignment_effect( fld_width); + } + } + ri->reorg_perf += reorg_nca; + } // 699 + + // regular accounting + double regular_nca = 0.0; + sbitmap *cache_model = sbitmap_alloc(1); + // TBD NOTE, pv steps on the pv above. + vector pv2 = perf_bb->vari; + for( auto pv2i = pv2->begin (); pv2i != pv2->end; pv2i++ ) { // 704 + tree base_type = base_type_of( pv2i->var->decl); + // create a tiny model of the cache big + // enough for this record. + int len = + ((length( base_type) + L1_CACHE_LINE_SIZE -1) + / + L1_CACHE_LINE_SIZE) + + + 1; + cache_model = sbitmap_resize( cache_model, len, 0); + int nf = number_of_fields_of( base_type); + int nrbo = number of record base offsets + double accum = 0.0; + for( rboi = 0; rboi < nrbo; rboi++ ) { + base_offset = offset_for( rboi); + // Access accounting + int fi; + for( fi = 0; fi < nf; fi++ ) { + if( bitmap_bit_p( fi, pv2i->bits) ) { + int fld_width = field_width( base_type, fi); + int fld_offset = field_offset( base_type, fi); + int chari; + for( chari = 0; chari < fld_width; chari++ ) { + int loc = (chari + field_offset + base_offset) + / + L1_CACHE_LINE_SIZE; + bitmap_set_bit(cache_model, loc); + } + } + } + accum += popcount( cache_model); + bitmap_clear( cache_model); + } + regular_nca += accum/nrbo; + + } // 739 + sbitmap_free( cache_model); + + if( ri != NULL ) { + ri->regular_perf += regular_nca; + cache_accesses_noreorg += regular_nca; + } else { + cache_accesses += regular_nca; + } + } // end for each prop_var 748 + + + } // + pop_cfun (); + } 
+ + // TBD Somebody somewhere needs to compute: + // reorg_perf + // regular_perf + // cache_accesses + // cache_accesses_noreorg + + double total_cache_accesses = + cache_accesses + cache_accesses_noreorg; + + info->total_cache_accesses = total_cache_accesses; + + // + // Decide which reorgTypes fail performance qualification + // + + // We have the total accesses per type. Use that to see + // if the type qualifies. + for ( auto reorgi = reorg_types->begin (); + reorgi != reorg_types->end (); reorgi++ ) + { + double with_opt = reorgi->instance_interleave.reorg_perf; + double wihtout_opt = reorgi->instance_interleave.regular_perf; + double raw_effect = with_opt/wihtout_opt; + double absolute_effect = + (wihtout_opt - with_opt) / total_cache_accesses; + + // Note, there would need to be a multi-pool case here if + // that is ever done. + + // If the relative effect is small enough don't bother. + if ( raw_effect < SINGLE_POOL_RAW_SKIP_IT ) + { + reorgi->do_instance_interleave = false; + continue; + } + // the relative effect is big enough do it anyway + // otherwise look at the absolute effect. 
+ if ( raw_effect >= SINGLE_POOL_RAW_DO_IT_ALWAYS ) + { + reorgi->do_instance_interleave = true; + continue; + } + if ( absolute_effect < SINGLE_POOL_ABS_SKIP_IT ) + { + reorgi->do_instance_interleave = false; + continue; + } + if ( absolute_effect >= SINGLE_POOL_ABS_DO_IT_ALWAYS ) + { + reorgi->do_instance_interleave = true; + continue; + } + + // We fitted a linear equation to the corners of the + // effects above and use it to determine when + // to disqualify a type + double cut_off = cut_off_eq_single_pool ( absolute_effect); + if ( raw_effect < cut_off ) + { + reorgi->do_instance_interleave = false; + } + + } + #endif } static void @@ -1919,6 +2143,35 @@ header ( bool initialize ) } } +// TBD I have doubts that this is what is really needed +bool +is_array_access( tree acc) +{ + tree type = TREE_TYPE ( acc); + if( TREE_CODE( type) == ARRAY_TYPE ) + return true; + while( POINTER_TYPE_P( type) ) { + type = TREE_TYPE( type); + if( TREE_CODE( type) == ARRAY_TYPE ) + return true; + } + return false; +} + +static void +account_for_use( tree acc, std::vector *acc_info) +{ + // determine element of access + // find field access number i + // find var v + varpool_node *v; + int i; + // TBD + acc_info_t ai = { v, i}; + acc_info->push_back( ai); +} + + // create_new_types has to crawl "all" the // types, create new types and transform // other types that must be changed. -- cgit v1.2.3