// Fixed timing problem
// [mw/milkymist.git] / cores / tmu2 / rtl / tmu2_texcache.v
/*
 * Milkymist VJ SoC
 * Copyright (C) 2007, 2008, 2009, 2010 Sebastien Bourdeauducq
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * tmu2_texcache - texture cache stage of the TMU2 pipeline.
 *
 * Every request carries four texel addresses (A..D) plus side-band data
 * (dadr, x_frac, y_frac) that is forwarded unchanged to the output. The four
 * texels are looked up in parallel in a direct-mapped cache (one tag per
 * 32-byte line). When at least one required texel misses, the pipeline is
 * stalled and the missing line(s) are fetched through the fml_* port, one
 * line at a time, before the request is released downstream.
 *
 * Parameters:
 *   cache_depth  log2 of the cache capacity in bytes.
 *   fml_depth    width of the byte address presented on fml_adr.
 *
 * Handshakes:
 *   pipe_stb_i / pipe_ack_o  upstream request strobe / acknowledge.
 *   pipe_stb_o / pipe_ack_i  downstream result strobe / acknowledge.
 *   fml_stb / fml_ack        line fetch request / acknowledge.
 *   flush                    invalidates every tag; only sampled in IDLE and
 *                            only when no miss is being started that cycle.
 *   busy                     high in every state except IDLE; in IDLE it is
 *                            high while stage 1 or stage 2 holds a request.
 */
module tmu2_texcache #(
	parameter cache_depth = 13, /* < log2 of the capacity in 8-bit words */
	parameter fml_depth = 26
) (
	input sys_clk,
	input sys_rst,

	output [fml_depth-1:0] fml_adr,
	output reg fml_stb,
	input fml_ack,
	input [63:0] fml_di,

	input flush,
	output reg busy,

	input pipe_stb_i,
	output reg pipe_ack_o,
	input [fml_depth-1-1:0] dadr, /* in 16-bit words */
	input [fml_depth-1-1:0] tadra, /* one bit narrower than the byte address */
	input [fml_depth-1-1:0] tadrb,
	input [fml_depth-1-1:0] tadrc,
	input [fml_depth-1-1:0] tadrd,
	input [5:0] x_frac,
	input [5:0] y_frac,

	output reg pipe_stb_o,
	input pipe_ack_i,
	output [fml_depth-1-1:0] dadr_f, /* in 16-bit words */
	output [15:0] tcolora,
	output [15:0] tcolorb,
	output [15:0] tcolorc,
	output [15:0] tcolord,
	output [5:0] x_frac_f,
	output [5:0] y_frac_f
);

/*
 * To make bit index calculations easier,
 * we work with 8-bit granularity EVERYWHERE, unless otherwise noted.
 */

/*
 * Line length is the burst length, that is 4*64 bits, or 32 bytes
 * Addresses are split as follows:
 *
 * |             TAG            |         INDEX          |   OFFSET   |
 * |fml_depth-1      cache_depth|cache_depth-1          5|4          0|
 *
 */

/* MEMORIES */

/* Byte addresses driving the four read ports (see INDEX GENERATOR). */
wire [fml_depth-1:0] indexa;
wire [fml_depth-1:0] indexb;
wire [fml_depth-1:0] indexc;
wire [fml_depth-1:0] indexd;

/* Clock enable shared by both memories; driven by the controller. */
reg ram_ce;

wire [31:0] datamem_d1;
wire [31:0] datamem_d2;
wire [31:0] datamem_d3;
wire [31:0] datamem_d4;

reg datamem_we;
wire [cache_depth-3-1:0] datamem_aw;

/*
 * Data memory: read ports are addressed in 32-bit words (byte address bits
 * [cache_depth-1:2]); the write port is addressed in 64-bit words and takes
 * fml_di directly.
 */
tmu2_qpram32 #(
	.depth(cache_depth-2)
) datamem (
	.sys_clk(sys_clk),
	.ce(ram_ce),

	.a1(indexa[cache_depth-1:2]),
	.d1(datamem_d1),
	.a2(indexb[cache_depth-1:2]),
	.d2(datamem_d2),
	.a3(indexc[cache_depth-1:2]),
	.d3(datamem_d3),
	.a4(indexd[cache_depth-1:2]),
	.d4(datamem_d4),

	.we(datamem_we),
	.aw(datamem_aw),
	.dw(fml_di)
);

wire [1+fml_depth-cache_depth-1:0] tagmem_d1; /* < valid bit + tag */
wire [1+fml_depth-cache_depth-1:0] tagmem_d2;
wire [1+fml_depth-cache_depth-1:0] tagmem_d3;
wire [1+fml_depth-cache_depth-1:0] tagmem_d4;

reg tagmem_we;
wire [cache_depth-1-5:0] tagmem_aw;
wire [1+fml_depth-cache_depth-1:0] tagmem_dw;

/* Tag memory: one {valid, tag} entry per 32-byte line. */
tmu2_qpram #(
	.depth(cache_depth-5),
	.width(1+fml_depth-cache_depth)
) tagmem (
	.sys_clk(sys_clk),
	.ce(ram_ce),

	.a1(indexa[cache_depth-1:5]),
	.d1(tagmem_d1),
	.a2(indexb[cache_depth-1:5]),
	.d2(tagmem_d2),
	.a3(indexc[cache_depth-1:5]),
	.d3(tagmem_d3),
	.a4(indexd[cache_depth-1:5]),
	.d4(tagmem_d4),

	.we(tagmem_we),
	.aw(tagmem_aw),
	.dw(tagmem_dw)
);

/* REQUEST TRACKER */

/*
 * Stage 0 is the raw input. The controller can force the incoming request
 * to be recorded as invalid through invalidate_req.
 */
reg invalidate_req;
wire rqvalid_0 = pipe_stb_i & ~invalidate_req;
wire [fml_depth-1-1:0] dadr_0 = dadr;
wire [5:0] x_frac_0 = x_frac;
wire [5:0] y_frac_0 = y_frac;
/* Convert the texel addresses to 8-bit granularity. */
wire [fml_depth-1:0] tadra8_0 = {tadra, 1'b0};
wire [fml_depth-1:0] tadrb8_0 = {tadrb, 1'b0};
wire [fml_depth-1:0] tadrc8_0 = {tadrc, 1'b0};
wire [fml_depth-1:0] tadrd8_0 = {tadrd, 1'b0};

reg rqvalid_1;
reg [fml_depth-1-1:0] dadr_1;
reg [5:0] x_frac_1;
reg [5:0] y_frac_1;
reg [fml_depth-1:0] tadra8_1;
reg [fml_depth-1:0] tadrb8_1;
reg [fml_depth-1:0] tadrc8_1;
reg [fml_depth-1:0] tadrd8_1;

reg rqvalid_2;
reg [fml_depth-1-1:0] dadr_2;
reg [5:0] x_frac_2;
reg [5:0] y_frac_2;
reg ignore_b_2;
reg ignore_c_2;
reg ignore_d_2;
reg [fml_depth-1:0] tadra8_2;
reg [fml_depth-1:0] tadrb8_2;
reg [fml_depth-1:0] tadrc8_2;
reg [fml_depth-1:0] tadrd8_2;

/* Advances both tracker stages at once; assigned next to the controller. */
wire rqt_ce;

/* Only the valid bits are reset; the payload registers are not. */
always @(posedge sys_clk) begin
	if(sys_rst) begin
		rqvalid_1 <= 1'b0;
		rqvalid_2 <= 1'b0;
	end else begin
		if(rqt_ce) begin
			rqvalid_1 <= rqvalid_0;
			dadr_1 <= dadr_0;
			x_frac_1 <= x_frac_0;
			y_frac_1 <= y_frac_0;
			tadra8_1 <= tadra8_0;
			tadrb8_1 <= tadrb8_0;
			tadrc8_1 <= tadrc8_0;
			tadrd8_1 <= tadrd8_0;

			rqvalid_2 <= rqvalid_1;
			dadr_2 <= dadr_1;
			x_frac_2 <= x_frac_1;
			y_frac_2 <= y_frac_1;
			/*
			 * A texel marked "ignore" always counts as a hit, so it
			 * can never trigger a fetch: B when x_frac is zero, C when
			 * y_frac is zero, D when either is zero.
			 */
			ignore_b_2 <= x_frac_1 == 6'd0;
			ignore_c_2 <= y_frac_1 == 6'd0;
			ignore_d_2 <= (x_frac_1 == 6'd0) | (y_frac_1 == 6'd0);
			tadra8_2 <= tadra8_1;
			tadrb8_2 <= tadrb8_1;
			tadrc8_2 <= tadrc8_1;
			tadrd8_2 <= tadrd8_1;
		end
	end
end

/* OUTPUT DATA GENERATOR */
assign dadr_f = dadr_2;
assign x_frac_f = x_frac_2;
assign y_frac_f = y_frac_2;

/* Bit 1 of the byte address picks the 16-bit half of the 32-bit word read. */
assign tcolora = tadra8_2[1] ? datamem_d1[15:0] : datamem_d1[31:16];
assign tcolorb = tadrb8_2[1] ? datamem_d2[15:0] : datamem_d2[31:16];
assign tcolorc = tadrc8_2[1] ? datamem_d3[15:0] : datamem_d3[31:16];
assign tcolord = tadrd8_2[1] ? datamem_d4[15:0] : datamem_d4[31:16];

/* INDEX GENERATOR */

/*
 * 0: the memories are read at the incoming (stage 0) addresses.
 * 1: the memories are read at the stage 2 addresses; the controller selects
 *    this in every state except IDLE and FLUSH.
 */
reg index_sel;

assign indexa = index_sel ? tadra8_2 : tadra8_0;
assign indexb = index_sel ? tadrb8_2 : tadrb8_0;
assign indexc = index_sel ? tadrc8_2 : tadrc8_0;
assign indexd = index_sel ? tadrd8_2 : tadrd8_0;

/* HIT DETECTION */
wire valid_a = tagmem_d1[1+fml_depth-cache_depth-1];
wire [fml_depth-1-cache_depth:0] tag_a = tagmem_d1[fml_depth-cache_depth-1:0];
wire valid_b = tagmem_d2[1+fml_depth-cache_depth-1];
wire [fml_depth-1-cache_depth:0] tag_b = tagmem_d2[fml_depth-cache_depth-1:0];
wire valid_c = tagmem_d3[1+fml_depth-cache_depth-1];
wire [fml_depth-1-cache_depth:0] tag_c = tagmem_d3[fml_depth-cache_depth-1:0];
wire valid_d = tagmem_d4[1+fml_depth-cache_depth-1];
wire [fml_depth-1-cache_depth:0] tag_d = tagmem_d4[fml_depth-cache_depth-1:0];

/* Texel A is always required; B, C and D hit unconditionally when ignored. */
wire hit_a = valid_a & (tag_a == tadra8_2[fml_depth-1:cache_depth]);
wire hit_b = ignore_b_2 | (valid_b & (tag_b == tadrb8_2[fml_depth-1:cache_depth]));
wire hit_c = ignore_c_2 | (valid_c & (tag_c == tadrc8_2[fml_depth-1:cache_depth]));
wire hit_d = ignore_d_2 | (valid_d & (tag_d == tadrd8_2[fml_depth-1:cache_depth]));

/* High when every texel needed by the stage 2 request is in the cache. */
wire all_hit = hit_a & hit_b & hit_c & hit_d;

/*
 * Simulation-only self check: compares each delivered texel against the
 * value returned by the $image_get system task.
 */
`ifdef VERIFY_TEXCACHE
integer x, y;
reg [15:0] expected;
always @(posedge sys_clk) begin
	if(pipe_stb_o & pipe_ack_i) begin
		x = (tadra8_2/2) % 512;
		y = (tadra8_2/2) / 512;
		$image_get(0, x, y, expected);
		if(tcolora !== expected) begin
			$display("CACHE TEST FAILED [A]! (%d, %d): expected %x, got %x", x, y, expected, tcolora);
			$finish;
		end
		if(~ignore_b_2) begin
			x = (tadrb8_2/2) % 512;
			y = (tadrb8_2/2) / 512;
			$image_get(0, x, y, expected);
			if(tcolorb !== expected) begin
				$display("CACHE TEST FAILED [B]! (%d, %d): expected %x, got %x", x, y, expected, tcolorb);
				$finish;
			end
		end
		if(~ignore_c_2) begin
			x = (tadrc8_2/2) % 512;
			y = (tadrc8_2/2) / 512;
			$image_get(0, x, y, expected);
			if(tcolorc !== expected) begin
				$display("CACHE TEST FAILED [C]! (%d, %d): expected %x, got %x", x, y, expected, tcolorc);
				$finish;
			end
		end
		if(~ignore_d_2) begin
			x = (tadrd8_2/2) % 512;
			y = (tadrd8_2/2) / 512;
			$image_get(0, x, y, expected);
			if(tcolord !== expected) begin
				$display("CACHE TEST FAILED [D]! (%d, %d): expected %x, got %x", x, y, expected, tcolord);
				$finish;
			end
		end
	end
end
`endif

/* FLUSH & MISS HANDLING */

/*
 * Address of the line to fetch. While fetch_adr_ce is high it captures the
 * address of the first missing texel in A, B, C, D priority order; it holds
 * its value when nothing misses.
 */
reg [fml_depth-1:0] fetch_adr;
reg fetch_adr_ce;

always @(posedge sys_clk) begin
	if(fetch_adr_ce) begin
		if(~hit_a)
			fetch_adr <= tadra8_2;
		else if(~hit_b)
			fetch_adr <= tadrb8_2;
		else if(~hit_c)
			fetch_adr <= tadrc8_2;
		else if(~hit_d)
			fetch_adr <= tadrd8_2;
	end
end

/*
 * The flush counter is held at zero outside flush mode and walks through
 * every tag entry while flush_mode is high; flush_done is raised on the
 * last entry (counter all ones).
 */
reg flush_mode;
wire flush_done;
reg [cache_depth-1-5:0] flush_counter;
always @(posedge sys_clk) begin
	if(flush_mode)
		flush_counter <= flush_counter + 1'd1;
	else
		flush_counter <= {cache_depth-5{1'b0}};
end
assign flush_done = &flush_counter;

/* write_valid is the valid bit stored with the tag (0 during a flush). */
reg write_valid;
assign tagmem_aw = flush_mode ? flush_counter : fetch_adr[cache_depth-1:5];
assign tagmem_dw = {write_valid, fetch_adr[fml_depth-1:cache_depth]};

/* burst_counter selects which 64-bit word of the line is being written. */
reg [1:0] burst_counter;
assign datamem_aw = {fetch_adr[cache_depth-1:5], burst_counter};

/* Fetches are always issued at the line-aligned address. */
assign fml_adr = {fetch_adr[fml_depth-1:5], 5'd0};

/* FSM-BASED CONTROLLER */
reg [3:0] state;
reg [3:0] next_state;

parameter IDLE          = 4'd0;
parameter DATA1         = 4'd1;
parameter DATA2         = 4'd2;
parameter DATA3         = 4'd3;
parameter DATA4         = 4'd4;
parameter HANDLED_MISS0 = 4'd5;
parameter HANDLED_MISS1 = 4'd6;
parameter HANDLED_MISS  = 4'd7;
parameter FLUSHPIPE1    = 4'd8;
parameter FLUSHPIPE2    = 4'd9;
parameter FLUSH         = 4'd10;

always @(posedge sys_clk) begin
	if(sys_rst)
		state <= IDLE;
	else
		state <= next_state;
end

/*
 * The tracker advances when a request is accepted from upstream, or when the
 * controller retires the request it has just served after a miss.
 */
assign rqt_ce = pipe_ack_o | invalidate_req;

always @(*) begin
	/* Defaults; each state only overrides what it needs. */
	next_state = state;

	tagmem_we = 1'b0;
	write_valid = 1'b1;

	datamem_we = 1'b0;
	burst_counter = 2'bx;

	flush_mode = 1'b0;

	fml_stb = 1'b0;

	busy = 1'b1;
	pipe_stb_o = 1'b0;
	pipe_ack_o = 1'b0;

	invalidate_req = 1'b0;
	fetch_adr_ce = 1'b0;

	index_sel = 1'b0;

	ram_ce = 1'b1;

	case(state)
		IDLE: begin
			busy = rqvalid_1|rqvalid_2;
			pipe_stb_o = rqvalid_2 & all_hit;
			/*
			 * Accept a new request (and clock the memories) when
			 * stage 2 is empty, or when it hits and downstream
			 * takes the result this cycle.
			 */
			pipe_ack_o = ~rqvalid_2 | (all_hit & pipe_ack_i);
			ram_ce = ~rqvalid_2 | (all_hit & pipe_ack_i);
			fetch_adr_ce = 1'b1;
			/* A pending miss takes priority over a flush request. */
			if(rqvalid_2 & ~all_hit) begin
				next_state = DATA1;
			end else if(flush)
				next_state = FLUSH;
		end
		/*
		 * DATA1..DATA4: one state per 64-bit word of the line. DATA1
		 * raises fml_stb, writes the tag, and waits for fml_ack; word 0
		 * is written on every cycle spent here, so the value written on
		 * the fml_ack cycle is the one that remains.
		 */
		DATA1: begin
			index_sel = 1'b1;
			fml_stb = 1'b1;
			burst_counter = 2'd0;
			datamem_we = 1'b1;
			tagmem_we = 1'b1;
			if(fml_ack)
				next_state = DATA2;
		end
		DATA2: begin
			index_sel = 1'b1;
			burst_counter = 2'd1;
			datamem_we = 1'b1;
			next_state = DATA3;
		end
		DATA3: begin
			index_sel = 1'b1;
			burst_counter = 2'd2;
			datamem_we = 1'b1;
			next_state = DATA4;
		end
		DATA4: begin
			index_sel = 1'b1;
			burst_counter = 2'd3;
			datamem_we = 1'b1;
			/*
			 * Re-evaluate the hits: if a texel is still missing,
			 * latch its address and fetch another line.
			 */
			fetch_adr_ce = 1'b1;
			if(~all_hit)
				next_state = DATA1;
			else
				next_state = HANDLED_MISS0;
		end
		/* wait for the written data to make its way through the pipelined RAM */
		HANDLED_MISS0: begin
			index_sel = 1'b1;
			next_state = HANDLED_MISS1;
		end
		HANDLED_MISS1: begin
			index_sel = 1'b1;
			next_state = HANDLED_MISS;
		end
		HANDLED_MISS: begin
			index_sel = 1'b1;
			pipe_stb_o = 1'b1;
			if(pipe_ack_i) begin
				/*
				 * Advance the tracker without acknowledging
				 * upstream: stage 1 is loaded as invalid, so
				 * the still-pending input is accepted later
				 * from IDLE.
				 */
				invalidate_req = 1'b1;
				next_state = FLUSHPIPE1;
			end
		end
		/* Two cycles reading at the new stage 2 addresses before IDLE. */
		FLUSHPIPE1: begin
			index_sel = 1'b1;
			next_state = FLUSHPIPE2;
		end
		FLUSHPIPE2: begin
			index_sel = 1'b1;
			next_state = IDLE;
		end
		FLUSH: begin
			/* Write an invalid entry to every tag location. */
			tagmem_we = 1'b1;
			write_valid = 1'b0;
			flush_mode = 1'b1;
			if(flush_done)
				next_state = IDLE;
		end
		/*
		 * Encodings 11..15 are unused. Without this branch a stray
		 * encoding would hold itself forever with busy asserted.
		 */
		default:
			next_state = IDLE;
	endcase
end

endmodule