Home Reference Source

src/utils/mp4-tools.ts

  1. import { sliceUint8 } from './typed-array';
  2. import { ElementaryStreamTypes } from '../loader/fragment';
  3.  
  4. type Mp4BoxData = {
  5. data: Uint8Array;
  6. start: number;
  7. end: number;
  8. };
  9.  
  10. const UINT32_MAX = Math.pow(2, 32) - 1;
  11. const push = [].push;
  12.  
  13. export function bin2str(data: Uint8Array): string {
  14. return String.fromCharCode.apply(null, data);
  15. }
  16.  
  17. export function readUint16(
  18. buffer: Uint8Array | Mp4BoxData,
  19. offset: number
  20. ): number {
  21. if ('data' in buffer) {
  22. offset += buffer.start;
  23. buffer = buffer.data;
  24. }
  25.  
  26. const val = (buffer[offset] << 8) | buffer[offset + 1];
  27.  
  28. return val < 0 ? 65536 + val : val;
  29. }
  30.  
  31. export function readUint32(
  32. buffer: Uint8Array | Mp4BoxData,
  33. offset: number
  34. ): number {
  35. if ('data' in buffer) {
  36. offset += buffer.start;
  37. buffer = buffer.data;
  38. }
  39.  
  40. const val =
  41. (buffer[offset] << 24) |
  42. (buffer[offset + 1] << 16) |
  43. (buffer[offset + 2] << 8) |
  44. buffer[offset + 3];
  45. return val < 0 ? 4294967296 + val : val;
  46. }
  47.  
  48. export function writeUint32(
  49. buffer: Uint8Array | Mp4BoxData,
  50. offset: number,
  51. value: number
  52. ) {
  53. if ('data' in buffer) {
  54. offset += buffer.start;
  55. buffer = buffer.data;
  56. }
  57. buffer[offset] = value >> 24;
  58. buffer[offset + 1] = (value >> 16) & 0xff;
  59. buffer[offset + 2] = (value >> 8) & 0xff;
  60. buffer[offset + 3] = value & 0xff;
  61. }
  62.  
  63. // Find the data for a box specified by its path
  64. export function findBox(
  65. input: Uint8Array | Mp4BoxData,
  66. path: Array<string>
  67. ): Array<Mp4BoxData> {
  68. const results = [] as Array<Mp4BoxData>;
  69. if (!path.length) {
  70. // short-circuit the search for empty paths
  71. return results;
  72. }
  73.  
  74. let data: Uint8Array;
  75. let start;
  76. let end;
  77. if ('data' in input) {
  78. data = input.data;
  79. start = input.start;
  80. end = input.end;
  81. } else {
  82. data = input;
  83. start = 0;
  84. end = data.byteLength;
  85. }
  86.  
  87. for (let i = start; i < end; ) {
  88. const size = readUint32(data, i);
  89. const type = bin2str(data.subarray(i + 4, i + 8));
  90. const endbox = size > 1 ? i + size : end;
  91.  
  92. if (type === path[0]) {
  93. if (path.length === 1) {
  94. // this is the end of the path and we've found the box we were
  95. // looking for
  96. results.push({ data: data, start: i + 8, end: endbox });
  97. } else {
  98. // recursively search for the next box along the path
  99. const subresults = findBox(
  100. { data: data, start: i + 8, end: endbox },
  101. path.slice(1)
  102. );
  103. if (subresults.length) {
  104. push.apply(results, subresults);
  105. }
  106. }
  107. }
  108. i = endbox;
  109. }
  110.  
  111. // we've finished searching all of data
  112. return results;
  113. }
  114.  
  115. type SidxInfo = {
  116. earliestPresentationTime: number;
  117. timescale: number;
  118. version: number;
  119. referencesCount: number;
  120. references: any[];
  121. moovEndOffset: number | null;
  122. };
  123.  
  124. export function parseSegmentIndex(initSegment: Uint8Array): SidxInfo | null {
  125. const moovBox = findBox(initSegment, ['moov']);
  126. const moov = moovBox[0];
  127. const moovEndOffset = moov ? moov.end : null; // we need this in case we need to chop of garbage of the end of current data
  128.  
  129. const sidxBox = findBox(initSegment, ['sidx']);
  130.  
  131. if (!sidxBox || !sidxBox[0]) {
  132. return null;
  133. }
  134.  
  135. const references: any[] = [];
  136. const sidx = sidxBox[0];
  137.  
  138. const version = sidx.data[0];
  139.  
  140. // set initial offset, we skip the reference ID (not needed)
  141. let index = version === 0 ? 8 : 16;
  142.  
  143. const timescale = readUint32(sidx, index);
  144. index += 4;
  145.  
  146. // TODO: parse earliestPresentationTime and firstOffset
  147. // usually zero in our case
  148. const earliestPresentationTime = 0;
  149. const firstOffset = 0;
  150.  
  151. if (version === 0) {
  152. index += 8;
  153. } else {
  154. index += 16;
  155. }
  156.  
  157. // skip reserved
  158. index += 2;
  159.  
  160. let startByte = sidx.end + firstOffset;
  161.  
  162. const referencesCount = readUint16(sidx, index);
  163. index += 2;
  164.  
  165. for (let i = 0; i < referencesCount; i++) {
  166. let referenceIndex = index;
  167.  
  168. const referenceInfo = readUint32(sidx, referenceIndex);
  169. referenceIndex += 4;
  170.  
  171. const referenceSize = referenceInfo & 0x7fffffff;
  172. const referenceType = (referenceInfo & 0x80000000) >>> 31;
  173.  
  174. if (referenceType === 1) {
  175. // eslint-disable-next-line no-console
  176. console.warn('SIDX has hierarchical references (not supported)');
  177. return null;
  178. }
  179.  
  180. const subsegmentDuration = readUint32(sidx, referenceIndex);
  181. referenceIndex += 4;
  182.  
  183. references.push({
  184. referenceSize,
  185. subsegmentDuration, // unscaled
  186. info: {
  187. duration: subsegmentDuration / timescale,
  188. start: startByte,
  189. end: startByte + referenceSize - 1,
  190. },
  191. });
  192.  
  193. startByte += referenceSize;
  194.  
  195. // Skipping 1 bit for |startsWithSap|, 3 bits for |sapType|, and 28 bits
  196. // for |sapDelta|.
  197. referenceIndex += 4;
  198.  
  199. // skip to next ref
  200. index = referenceIndex;
  201. }
  202.  
  203. return {
  204. earliestPresentationTime,
  205. timescale,
  206. version,
  207. referencesCount,
  208. references,
  209. moovEndOffset,
  210. };
  211. }
  212.  
  213. /**
  214. * Parses an MP4 initialization segment and extracts stream type and
  215. * timescale values for any declared tracks. Timescale values indicate the
  216. * number of clock ticks per second to assume for time-based values
  217. * elsewhere in the MP4.
  218. *
  219. * To determine the start time of an MP4, you need two pieces of
  220. * information: the timescale unit and the earliest base media decode
  221. * time. Multiple timescales can be specified within an MP4 but the
  222. * base media decode time is always expressed in the timescale from
  223. * the media header box for the track:
  224. * ```
  225. * moov > trak > mdia > mdhd.timescale
  226. * moov > trak > mdia > hdlr
  227. * ```
  228. * @param initSegment {Uint8Array} the bytes of the init segment
 * @return {InitData} an array indexed by track id that also carries one entry
 * per stream type found; it is empty when no usable track could be parsed.
  231. */
  232.  
  233. export interface InitDataTrack {
  234. timescale: number;
  235. id: number;
  236. codec: string;
  237. }
  238.  
  239. type HdlrType = ElementaryStreamTypes.AUDIO | ElementaryStreamTypes.VIDEO;
  240.  
  241. export interface InitData extends Array<any> {
  242. [index: number]:
  243. | {
  244. timescale: number;
  245. type: HdlrType;
  246. default?: {
  247. duration: number;
  248. flags: number;
  249. };
  250. }
  251. | undefined;
  252. audio?: InitDataTrack;
  253. video?: InitDataTrack;
  254. }
  255.  
  256. export function parseInitSegment(initSegment: Uint8Array): InitData {
  257. const result: InitData = [];
  258. const traks = findBox(initSegment, ['moov', 'trak']);
  259. for (let i = 0; i < traks.length; i++) {
  260. const trak = traks[i];
  261. const tkhd = findBox(trak, ['tkhd'])[0];
  262. if (tkhd) {
  263. let version = tkhd.data[tkhd.start];
  264. let index = version === 0 ? 12 : 20;
  265. const trackId = readUint32(tkhd, index);
  266. const mdhd = findBox(trak, ['mdia', 'mdhd'])[0];
  267. if (mdhd) {
  268. version = mdhd.data[mdhd.start];
  269. index = version === 0 ? 12 : 20;
  270. const timescale = readUint32(mdhd, index);
  271. const hdlr = findBox(trak, ['mdia', 'hdlr'])[0];
  272. if (hdlr) {
  273. const hdlrType = bin2str(
  274. hdlr.data.subarray(hdlr.start + 8, hdlr.start + 12)
  275. );
  276. const type: HdlrType | undefined = {
  277. soun: ElementaryStreamTypes.AUDIO as const,
  278. vide: ElementaryStreamTypes.VIDEO as const,
  279. }[hdlrType];
  280. if (type) {
  281. // Parse codec details
  282. const stsd = findBox(trak, ['mdia', 'minf', 'stbl', 'stsd'])[0];
  283. let codec;
  284. if (stsd) {
  285. codec = bin2str(
  286. stsd.data.subarray(stsd.start + 12, stsd.start + 16)
  287. );
  288. // TODO: Parse codec details to be able to build MIME type.
  289. // stsd.start += 8;
  290. // const codecBox = findBox(stsd, [codec])[0];
  291. // if (codecBox) {
  292. // TODO: Codec parsing support for avc1, mp4a, hevc, av01...
  293. // }
  294. }
  295. result[trackId] = { timescale, type };
  296. result[type] = { timescale, id: trackId, codec };
  297. }
  298. }
  299. }
  300. }
  301. }
  302.  
  303. const trex = findBox(initSegment, ['moov', 'mvex', 'trex']);
  304. trex.forEach((trex) => {
  305. const trackId = readUint32(trex, 4);
  306. const track = result[trackId];
  307. if (track) {
  308. track.default = {
  309. duration: readUint32(trex, 12),
  310. flags: readUint32(trex, 20),
  311. };
  312. }
  313. });
  314.  
  315. return result;
  316. }
  317.  
  318. /**
  319. * Determine the base media decode start time, in seconds, for an MP4
  320. * fragment. If multiple fragments are specified, the earliest time is
  321. * returned.
  322. *
  323. * The base media decode time can be parsed from track fragment
  324. * metadata:
  325. * ```
  326. * moof > traf > tfdt.baseMediaDecodeTime
  327. * ```
  328. * It requires the timescale value from the mdhd to interpret.
  329. *
  330. * @param initData {InitData} a hash of track type to timescale values
  331. * @param fmp4 {Uint8Array} the bytes of the mp4 fragment
  332. * @return {number} the earliest base media decode start time for the
  333. * fragment, in seconds
  334. */
  335. export function getStartDTS(initData: InitData, fmp4: Uint8Array): number {
  336. // we need info from two children of each track fragment box
  337. return (
  338. findBox(fmp4, ['moof', 'traf']).reduce((result: number | null, traf) => {
  339. const tfdt = findBox(traf, ['tfdt'])[0];
  340. const version = tfdt.data[tfdt.start];
  341. const start = findBox(traf, ['tfhd']).reduce(
  342. (result: number | null, tfhd) => {
  343. // get the track id from the tfhd
  344. const id = readUint32(tfhd, 4);
  345. const track = initData[id];
  346. if (track) {
  347. let baseTime = readUint32(tfdt, 4);
  348. if (version === 1) {
  349. baseTime *= Math.pow(2, 32);
  350. baseTime += readUint32(tfdt, 8);
  351. }
  352. // assume a 90kHz clock if no timescale was specified
  353. const scale = track.timescale || 90e3;
  354. // convert base time to seconds
  355. const startTime = baseTime / scale;
  356. if (
  357. isFinite(startTime) &&
  358. (result === null || startTime < result)
  359. ) {
  360. return startTime;
  361. }
  362. }
  363. return result;
  364. },
  365. null
  366. );
  367. if (
  368. start !== null &&
  369. isFinite(start) &&
  370. (result === null || start < result)
  371. ) {
  372. return start;
  373. }
  374. return result;
  375. }, null) || 0
  376. );
  377. }
  378.  
  379. /*
  380. For Reference:
  381. aligned(8) class TrackFragmentHeaderBox
  382. extends FullBox(‘tfhd’, 0, tf_flags){
  383. unsigned int(32) track_ID;
  384. // all the following are optional fields
  385. unsigned int(64) base_data_offset;
  386. unsigned int(32) sample_description_index;
  387. unsigned int(32) default_sample_duration;
  388. unsigned int(32) default_sample_size;
  389. unsigned int(32) default_sample_flags
  390. }
  391. */
  392. export function getDuration(data: Uint8Array, initData: InitData) {
  393. let rawDuration = 0;
  394. let videoDuration = 0;
  395. let audioDuration = 0;
  396. const trafs = findBox(data, ['moof', 'traf']);
  397. for (let i = 0; i < trafs.length; i++) {
  398. const traf = trafs[i];
  399. // There is only one tfhd & trun per traf
  400. // This is true for CMAF style content, and we should perhaps check the ftyp
  401. // and only look for a single trun then, but for ISOBMFF we should check
  402. // for multiple track runs.
  403. const tfhd = findBox(traf, ['tfhd'])[0];
  404. // get the track id from the tfhd
  405. const id = readUint32(tfhd, 4);
  406. const track = initData[id];
  407. if (!track) {
  408. continue;
  409. }
  410. const trackDefault = track.default;
  411. const tfhdFlags = readUint32(tfhd, 0) | trackDefault?.flags!;
  412. let sampleDuration: number | undefined = trackDefault?.duration;
  413. if (tfhdFlags & 0x000008) {
  414. // 0x000008 indicates the presence of the default_sample_duration field
  415. if (tfhdFlags & 0x000002) {
  416. // 0x000002 indicates the presence of the sample_description_index field, which precedes default_sample_duration
  417. // If present, the default_sample_duration exists at byte offset 12
  418. sampleDuration = readUint32(tfhd, 12);
  419. } else {
  420. // Otherwise, the duration is at byte offset 8
  421. sampleDuration = readUint32(tfhd, 8);
  422. }
  423. }
  424. // assume a 90kHz clock if no timescale was specified
  425. const timescale = track.timescale || 90e3;
  426. const truns = findBox(traf, ['trun']);
  427. for (let j = 0; j < truns.length; j++) {
  428. rawDuration = computeRawDurationFromSamples(truns[j]);
  429. if (!rawDuration && sampleDuration) {
  430. const sampleCount = readUint32(truns[j], 4);
  431. rawDuration = sampleDuration * sampleCount;
  432. }
  433. if (track.type === ElementaryStreamTypes.VIDEO) {
  434. videoDuration += rawDuration / timescale;
  435. } else if (track.type === ElementaryStreamTypes.AUDIO) {
  436. audioDuration += rawDuration / timescale;
  437. }
  438. }
  439. }
  440. if (videoDuration === 0 && audioDuration === 0) {
  441. // If duration samples are not available in the traf use sidx subsegment_duration
  442. const sidx = parseSegmentIndex(data);
  443. if (sidx?.references) {
  444. return sidx.references.reduce(
  445. (dur, ref) => dur + ref.info.duration || 0,
  446. 0
  447. );
  448. }
  449. }
  450. if (videoDuration) {
  451. return videoDuration;
  452. }
  453. return audioDuration;
  454. }
  455.  
  456. /*
  457. For Reference:
  458. aligned(8) class TrackRunBox
  459. extends FullBox(‘trun’, version, tr_flags) {
  460. unsigned int(32) sample_count;
  461. // the following are optional fields
  462. signed int(32) data_offset;
  463. unsigned int(32) first_sample_flags;
  464. // all fields in the following array are optional
  465. {
  466. unsigned int(32) sample_duration;
  467. unsigned int(32) sample_size;
  468. unsigned int(32) sample_flags
  469. if (version == 0)
  470. { unsigned int(32)
  471. else
  472. { signed int(32)
  473. }[ sample_count ]
  474. }
  475. */
  476. export function computeRawDurationFromSamples(trun): number {
  477. const flags = readUint32(trun, 0);
  478. // Flags are at offset 0, non-optional sample_count is at offset 4. Therefore we start 8 bytes in.
  479. // Each field is an int32, which is 4 bytes
  480. let offset = 8;
  481. // data-offset-present flag
  482. if (flags & 0x000001) {
  483. offset += 4;
  484. }
  485. // first-sample-flags-present flag
  486. if (flags & 0x000004) {
  487. offset += 4;
  488. }
  489.  
  490. let duration = 0;
  491. const sampleCount = readUint32(trun, 4);
  492. for (let i = 0; i < sampleCount; i++) {
  493. // sample-duration-present flag
  494. if (flags & 0x000100) {
  495. const sampleDuration = readUint32(trun, offset);
  496. duration += sampleDuration;
  497. offset += 4;
  498. }
  499. // sample-size-present flag
  500. if (flags & 0x000200) {
  501. offset += 4;
  502. }
  503. // sample-flags-present flag
  504. if (flags & 0x000400) {
  505. offset += 4;
  506. }
  507. // sample-composition-time-offsets-present flag
  508. if (flags & 0x000800) {
  509. offset += 4;
  510. }
  511. }
  512. return duration;
  513. }
  514.  
  515. export function offsetStartDTS(
  516. initData: InitData,
  517. fmp4: Uint8Array,
  518. timeOffset: number
  519. ) {
  520. findBox(fmp4, ['moof', 'traf']).forEach(function (traf) {
  521. findBox(traf, ['tfhd']).forEach(function (tfhd) {
  522. // get the track id from the tfhd
  523. const id = readUint32(tfhd, 4);
  524. const track = initData[id];
  525. if (!track) {
  526. return;
  527. }
  528. // assume a 90kHz clock if no timescale was specified
  529. const timescale = track.timescale || 90e3;
  530. // get the base media decode time from the tfdt
  531. findBox(traf, ['tfdt']).forEach(function (tfdt) {
  532. const version = tfdt.data[tfdt.start];
  533. let baseMediaDecodeTime = readUint32(tfdt, 4);
  534. if (version === 0) {
  535. writeUint32(tfdt, 4, baseMediaDecodeTime - timeOffset * timescale);
  536. } else {
  537. baseMediaDecodeTime *= Math.pow(2, 32);
  538. baseMediaDecodeTime += readUint32(tfdt, 8);
  539. baseMediaDecodeTime -= timeOffset * timescale;
  540. baseMediaDecodeTime = Math.max(baseMediaDecodeTime, 0);
  541. const upper = Math.floor(baseMediaDecodeTime / (UINT32_MAX + 1));
  542. const lower = Math.floor(baseMediaDecodeTime % (UINT32_MAX + 1));
  543. writeUint32(tfdt, 4, upper);
  544. writeUint32(tfdt, 8, lower);
  545. }
  546. });
  547. });
  548. });
  549. }
  550.  
  551. // TODO: Check if the last moof+mdat pair is part of the valid range
  552. export function segmentValidRange(data: Uint8Array): SegmentedRange {
  553. const segmentedRange: SegmentedRange = {
  554. valid: null,
  555. remainder: null,
  556. };
  557.  
  558. const moofs = findBox(data, ['moof']);
  559. if (!moofs) {
  560. return segmentedRange;
  561. } else if (moofs.length < 2) {
  562. segmentedRange.remainder = data;
  563. return segmentedRange;
  564. }
  565. const last = moofs[moofs.length - 1];
  566. // Offset by 8 bytes; findBox offsets the start by as much
  567. segmentedRange.valid = sliceUint8(data, 0, last.start - 8);
  568. segmentedRange.remainder = sliceUint8(data, last.start - 8);
  569. return segmentedRange;
  570. }
  571.  
  572. export interface SegmentedRange {
  573. valid: Uint8Array | null;
  574. remainder: Uint8Array | null;
  575. }
  576.  
  577. export function appendUint8Array(
  578. data1: Uint8Array,
  579. data2: Uint8Array
  580. ): Uint8Array {
  581. const temp = new Uint8Array(data1.length + data2.length);
  582. temp.set(data1);
  583. temp.set(data2, data1.length);
  584.  
  585. return temp;
  586. }