SQL Server Forums
Profile | Register | Active Topics | Members | Search | Forum FAQ
 
Register Now and get your question answered!
Username:
Password:
Save Password
Forgot your Password?

 All Forums
 SQL Server 2008 Forums
 Transact-SQL (2008)
 45 Million row table, need to get a unique count
 New Topic  Reply to Topic
 Printer Friendly
Author Previous Topic Topic Next Topic  

Mnemonic
Yak Posting Veteran

58 Posts

Posted - 07/12/2012 :  12:33:50  Show Profile  Reply with Quote
Hey all,

I have a simple query for counting the distinct amount of users playing a game, however since the table has grown to 45 million rows its taking about 30 seconds. Is there any way to make this more efficient? The hashed match is taking about 37% of the execution plan and the rest is hitting the index. Due to the way this table works there can be many rows with the same username in the playername column.

SELECT COUNT(DISTINCT PlayerName) AS UniqueCount FROM [Players].[dbo].[PlayerStateChangeEvents]


Execution Plan:
ShowPlanXML xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" Version="1.1" Build="10.50.2811.0" xmlns="http://schemas.microsoft.com/sqlserver/2004/07/showplan">
<BatchSequence>
<Batch>
<Statements>
<StmtSimple StatementCompId="1" StatementEstRows="1" StatementId="1" StatementOptmLevel="FULL" StatementSubTreeCost="624.379" StatementText="SELECT COUNT(DISTINCT PlayerName) AS UniqueCount FROM [MCPTest].[dbo].[PlayerStateChangeEvents]#xD;#xA;" StatementType="SELECT" QueryHash="0x5BF1F66868E0A395" QueryPlanHash="0x4E3436E1C79F4932">
<StatementSetOptions ANSI_NULLS="true" ANSI_PADDING="true" ANSI_WARNINGS="true" ARITHABORT="true" CONCAT_NULL_YIELDS_NULL="true" NUMERIC_ROUNDABORT="false" QUOTED_IDENTIFIER="true" />
<QueryPlan CachedPlanSize="32" CompileTime="3" CompileCPU="3" CompileMemory="272">
<RelOp AvgRowSize="11" EstimateCPU="1E-07" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="1" LogicalOp="Compute Scalar" NodeId="1" Parallel="false" PhysicalOp="Compute Scalar" EstimatedTotalSubtreeCost="624.379">
<OutputList>
<ColumnReference Column="Expr1003" />
</OutputList>
<ComputeScalar>
<DefinedValues>
<DefinedValue>
<ColumnReference Column="Expr1003" />
<ScalarOperator ScalarString="CONVERT_IMPLICIT(int,[globalagg1005],0)">
<Convert DataType="int" Style="0" Implicit="true">
<ScalarOperator>
<Identifier>
<ColumnReference Column="globalagg1005" />
</Identifier>
</ScalarOperator>
</Convert>
</ScalarOperator>
</DefinedValue>
</DefinedValues>
<RelOp AvgRowSize="15" EstimateCPU="1.7E-06" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="1" LogicalOp="Aggregate" NodeId="2" Parallel="false" PhysicalOp="Stream Aggregate" EstimatedTotalSubtreeCost="624.379">
<OutputList>
<ColumnReference Column="globalagg1005" />
</OutputList>
<StreamAggregate>
<DefinedValues>
<DefinedValue>
<ColumnReference Column="globalagg1005" />
<ScalarOperator ScalarString="SUM([partialagg1004])">
<Aggregate AggType="SUM" Distinct="false">
<ScalarOperator>
<Identifier>
<ColumnReference Column="partialagg1004" />
</Identifier>
</ScalarOperator>
</Aggregate>
</ScalarOperator>
</DefinedValue>
</DefinedValues>
<RelOp AvgRowSize="15" EstimateCPU="0.0285021" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="2" LogicalOp="Gather Streams" NodeId="3" Parallel="true" PhysicalOp="Parallelism" EstimatedTotalSubtreeCost="624.379">
<OutputList>
<ColumnReference Column="partialagg1004" />
</OutputList>
<Parallelism>
<RelOp AvgRowSize="15" EstimateCPU="0.0057159" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="2" LogicalOp="Aggregate" NodeId="4" Parallel="true" PhysicalOp="Stream Aggregate" EstimatedTotalSubtreeCost="624.351">
<OutputList>
<ColumnReference Column="partialagg1004" />
</OutputList>
<StreamAggregate>
<DefinedValues>
<DefinedValue>
<ColumnReference Column="partialagg1004" />
<ScalarOperator ScalarString="Count(*)">
<Aggregate AggType="countstar" Distinct="false" />
</ScalarOperator>
</DefinedValue>
</DefinedValues>
<RelOp AvgRowSize="9" EstimateCPU="1.33076" EstimateIO="0.00563063" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="19051.3" LogicalOp="Distinct Sort" NodeId="5" Parallel="true" PhysicalOp="Sort" EstimatedTotalSubtreeCost="624.345">
<OutputList />
<MemoryFractions Input="1" Output="1" />
<Sort Distinct="true">
<OrderBy>
<OrderByColumn Ascending="true">
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</OrderByColumn>
</OrderBy>
<RelOp AvgRowSize="75" EstimateCPU="3.93988" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="38102.7" LogicalOp="Repartition Streams" NodeId="6" Parallel="true" PhysicalOp="Parallelism" EstimatedTotalSubtreeCost="623.008">
<OutputList>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</OutputList>
<Parallelism PartitioningType="Hash">
<PartitionColumns>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</PartitionColumns>
<RelOp AvgRowSize="75" EstimateCPU="199.249" EstimateIO="0" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="38102.7" LogicalOp="Partial Aggregate" NodeId="7" Parallel="true" PhysicalOp="Hash Match" EstimatedTotalSubtreeCost="619.069">
<OutputList>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</OutputList>
<MemoryFractions Input="0" Output="0" />
<Hash>
<DefinedValues />
<HashKeysBuild>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</HashKeysBuild>
<BuildResidual>
<ScalarOperator ScalarString="[Players].[dbo].[PlayerStateChangeEvents].[PlayerName] = [Players].[dbo].[PlayerStateChangeEvents].[PlayerName]">
<Compare CompareOp="IS">
<ScalarOperator>
<Identifier>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</Identifier>
</ScalarOperator>
<ScalarOperator>
<Identifier>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</Identifier>
</ScalarOperator>
</Compare>
</ScalarOperator>
</BuildResidual>
<RelOp AvgRowSize="75" EstimateCPU="24.7263" EstimateIO="395.093" EstimateRebinds="0" EstimateRewinds="0" EstimateRows="44956800" LogicalOp="Clustered Index Scan" NodeId="8" Parallel="true" PhysicalOp="Clustered Index Scan" EstimatedTotalSubtreeCost="419.819" TableCardinality="44956800">
<OutputList>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</OutputList>
<IndexScan Ordered="false" ForcedIndex="false" ForceScan="false" NoExpandHint="false">
<DefinedValues>
<DefinedValue>
<ColumnReference Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Column="PlayerName" />
</DefinedValue>
</DefinedValues>
<Object Database="[Players]" Schema="[dbo]" Table="[PlayerStateChangeEvents]" Index="[PK_PlayerStateChangeEvents]" IndexKind="Clustered" />
</IndexScan>
</RelOp>
</Hash>
</RelOp>
</Parallelism>
</RelOp>
</Sort>
</RelOp>
</StreamAggregate>
</RelOp>
</Parallelism>
</RelOp>
</StreamAggregate>
</RelOp>
</ComputeScalar>
</RelOp>
</QueryPlan>
</StmtSimple>
</Statements>
</Batch>
</BatchSequence>
</ShowPlanXML>

nigelrivett
Flowing Fount of Yak Knowledge

United Kingdom
3328 Posts

Posted - 07/12/2012 :  12:39:56  Show Profile  Visit nigelrivett's Homepage  Reply with Quote
Do you have an index on PlayerName?
You might look at maintaining the count when the table is updated - especially if the count is small compared to the number of rows in the table.
You could also make the index smaller by holding a player id rather than a name on this table - should also make the table smaller too.

==========================================
Cursors are useful if you don't know sql.
SSIS can be used in a similar way.
Beer is not cold and it isn't fizzy.
Go to Top of Page

Mnemonic
Yak Posting Veteran

58 Posts

Posted - 07/12/2012 :  12:43:38  Show Profile  Reply with Quote
No, only on ID is there a clustered index. Do you think adding a nonclustered index on PlayerName will help?

I'd love to be able to ID my players with a numerical value but thats not the way the data is inserted and cant really be changed. The Playername is data sent to me via way of XBox live/PSN etc.
Go to Top of Page

nigelrivett
Flowing Fount of Yak Knowledge

United Kingdom
3328 Posts

Posted - 07/12/2012 :  12:46:23  Show Profile  Visit nigelrivett's Homepage  Reply with Quote
>> Do you think adding a nonclustered index on PlayerName will help?
Yes it would help this query. Might slow down other things though.
You will probably find you are very limited if you only have a clustered index on an identity?
Everything that doesn't use that column to filter will probably table scan.

==========================================
Cursors are useful if you don't know sql.
SSIS can be used in a similar way.
Beer is not cold and it isn't fizzy.
Go to Top of Page

Mnemonic
Yak Posting Veteran

58 Posts

Posted - 07/12/2012 :  15:52:01  Show Profile  Reply with Quote
Actually, i dont think i can do that. The developers made it a varchar(MAX) column, which is an invalid type for an index.
Go to Top of Page

tkizer
Almighty SQL Goddess

USA
35007 Posts

Posted - 07/12/2012 :  16:03:45  Show Profile  Visit tkizer's Homepage  Reply with Quote
Fix the data type (max doesn't make sense for people's names) and then add an index to it.

Tara Kizer
Microsoft MVP for Windows Server System - SQL Server
http://weblogs.sqlteam.com/tarad/

Subscribe to my blog
Go to Top of Page

LoztInSpace
Aged Yak Warrior

876 Posts

Posted - 07/12/2012 :  23:28:52  Show Profile  Reply with Quote
Sounds to me like you're missing an entire table (Player).
Go to Top of Page
  Previous Topic Topic Next Topic  
 New Topic  Reply to Topic
 Printer Friendly
Jump To:
SQL Server Forums © 2000-2009 SQLTeam Publishing, LLC Go To Top Of Page
This page was generated in 0.12 seconds. Powered By: Snitz Forums 2000