getAllStatuses(); $renderer = get_active_status_renderer(); echo ''; } /* Callback hooked to 'bb_head' by topic_icons_init(); echoes a single newline. NOTE(review): the echoed string looks as if its markup was stripped from this copy - confirm against the original plugin. */ function topic_icons_css() { echo "\n"; } /* Filter callback registered on 'bb_topic_labels'. On front, forum, view and tag pages it asks the active status interpreter for the status of the global $topic, lets the active renderer turn that status into an image name and a tooltip, and returns a sprintf()-formatted string; on any other page it returns $label unchanged. When the rendered image file is not found under this plugin's icon-sets directory, the empty.png URL is passed to sprintf() instead. NOTE(review): every sprintf() format below has fewer %s placeholders than arguments, so the trailing arguments are ignored - presumably the HTML in the format strings was stripped from this copy; verify. */ function topic_icons_label( $label ) { global $topic; if (bb_is_front() || bb_is_forum() || bb_is_view() || bb_is_tag()) { $icon_set_name = topic_icons_get_active_icon_set(); $icon_set_url = ICON_SET_URL_BASE . $icon_set_name; $status = get_active_status_interpreter()->getStatus(bb_get_location(), $topic); $renderer = get_active_status_renderer(); $image = $renderer->renderStatus($status); $tooltip = $renderer->renderStatusTooltip($status); $exists = file_exists(dirname(__FILE__).'/icon-sets/'.$icon_set_name.'/'.$image); if (!$exists) { return sprintf(__('
%s
%s'), get_topic_link($topic->topic_id), ICON_SET_URL_BASE.'/empty.png', ICON_WIDTH, ICON_HEIGHT, $tooltip, $label); } else if (strlen($tooltip) > 0) { return sprintf(__('
%s%s
%s'), get_topic_link($topic->topic_id), $icon_set_url.'/'.$image, ICON_WIDTH, ICON_HEIGHT, $tooltip, $tooltip, $label); } else { return sprintf(__('
%s
%s'), get_topic_link($topic->topic_id), $icon_set_url.'/'.$image, ICON_WIDTH, ICON_HEIGHT, $tooltip, $label); } } return $label; } /* Registers the plugin's hooks: removes the 'bb_closed_label' and 'bb_sticky_label' filters from 'bb_topic_labels', adds topic_icons_label at priority 11, hooks the CSS and admin-page callbacks, and registers the 'default' status interpreter (constructed with BUSY_THRESHOLD) and the 'default' status renderer. */ function topic_icons_init( ) { remove_filter('bb_topic_labels', 'bb_closed_label', 10); remove_filter('bb_topic_labels', 'bb_sticky_label', 20); add_filter('bb_topic_labels', 'topic_icons_label', 11); add_action('bb_head', 'topic_icons_css'); add_action('bb_admin_menu_generator', 'topic_icons_admin_page_add'); add_action('bb_admin-header.php', 'topic_icons_admin_page_process'); topic_icons_register_status_interpreter('default', new DefaultStatusInterpreter(BUSY_THRESHOLD)); topic_icons_register_status_renderer('default', new DefaultStatusRenderer()); } /* Run the setup as soon as this file is loaded. */ topic_icons_init(); ?> Digital Humanities Questions & Answers » Topic: How create a real (with page numbers) index of journal's entire run, from PDFs? http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs Digital Humanities Questions & Answers » Topic: How create a real (with page numbers) index of journal's entire run, from PDFs? en-US Sat, 23 Mar 2019 06:40:07 +0000 http://bbpress.org/?v=1.0.2 <![CDATA[Search]]> q http://digitalhumanities.org/answers/search.php Joel Kalvesmaki on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1913 Thu, 07 Mar 2013 14:22:39 +0000 Joel Kalvesmaki 1913@http://digitalhumanities.org/answers/ <p>You could try indexing software such as <a href="http://www.pdfindexgenerator.com/" rel="nofollow">http://www.pdfindexgenerator.com/</a>. But it sounds as if the level of quality and detail to which you aspire would be best handled not so much by software but by hiring a professional indexer who already uses such software and can write a strong index in a timely manner. Of course, if you have more time than money, this may not be feasible. 
</p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1911 Wed, 06 Mar 2013 20:25:43 +0000 olaf 1911@http://digitalhumanities.org/answers/ <p><em>Replying to @<a href='http://digitalhumanities.org/answers/profile/olaf'>olaf</a>'s <a href="http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1910">post</a>:</em></p> <p>The OCR idea seems to be a bust, unfortunately. It's a pain to convert to a "flat" file without renderable text. Then, even the newest version of Acrobat is finding it difficult to understand diacritics, italics and anything else non-standard. I think I'll be better off working with mistakes that follow a regular pattern (such as a≠ always equals ā) and working on a script or something to do mass replacements.<br /> Don't know why Adobe doesn't allow OCR of files with renderable text in them. What could be the harm? </p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1910 Wed, 06 Mar 2013 19:00:33 +0000 olaf 1910@http://digitalhumanities.org/answers/ <p><em>Replying to @Peter Organisciak's <a href="http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1909">post</a>:</em></p> <p>Thanks for the tips.</p> <p>Not automated. Just more convenient, and perhaps with some automated features to help with the actual index creation.</p> <p>I hadn't thought of running any OCR on the older files, since I made them many years ago from the original Word or Nisus files (i.e., they were never scanned or OCRed), but that's a great idea that I'm about to try. 
Don't know if OCR will ignore the text that's already 'live' though, or if I'll have to flatten them first.</p> <p>I'll definitely take a stroll through the research and see what I can find. </p> Peter Organisciak on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1909 Wed, 06 Mar 2013 18:41:49 +0000 Peter Organisciak 1909@http://digitalhumanities.org/answers/ <p><em>Replying to @<a href='http://digitalhumanities.org/answers/profile/olaf'>olaf</a>'s <a href="http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1908">post</a>:</em></p> <p>I believe you're looking for an automated way to create a back-of-the-book index, correct? 'Indexing' tends to refer to building indices for information retrieval (such as Terrier and Lucene's PDF parsers), which is why you couldn't find it on Google.</p> <p>Back-of-the-book indexes are tough to parse. Patrick Juola wrote about the need for such software and the technical challenges in <a href="http://llc.oxfordjournals.org/content/23/1/73.full?sid=f01a5ee3-2477-4711-8fff-d42916eead6d">Killer Applications for Digital Humanities</a>. If I recall, he had early work in the area: I'm not sure what came of it. </p> <p>I don't know if there is any software that would do what you need. However, since it's a tough problem, you can be sure that researchers have tried it. Your best bet is to look through the research literature and see if any researchers have released their code. A scholar search for 'back-of-the-book indexing' along with keywords like 'unsupervised', 'semi-supervised', or 'automated' gave me some potentially useful articles. 
Still, you'd probably have to split the problem into two parts — parsing PDFs to text and generating an index — as I suspect there aren't any tools mature enough to include PDF parsing.</p> <p>To be honest, your approach of going through manually and highlighting notable terms sounds more tractable to me. With the OCR problems: have you tried re-applying text recognition on the older issues with the newest version of Acrobat Professional? Their OCR improves often.</p> <p>Sorry that I don't have a better answer for you. Good luck. </p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1908 Wed, 06 Mar 2013 18:21:41 +0000 olaf 1908@http://digitalhumanities.org/answers/ <p>One more wish for the wishlist: a way to designate a term as fitting into more than one topic in the index. For example, al-Zahir Baybars would be indexed as himself and under "sultans". </p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1907 Wed, 06 Mar 2013 18:09:23 +0000 olaf 1907@http://digitalhumanities.org/answers/ <p><em>Replying to @Dorothea Salo's <a href="http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1905">post</a>:</em></p> <p>I mean a real index, not a concordance. The need to leave out passing mentions is one of the reasons that no software will be able to automate the process. </p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" 
http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1906 Wed, 06 Mar 2013 18:07:37 +0000 olaf 1906@http://digitalhumanities.org/answers/ <p>One thing I've been playing with today is going through a pdf and using the highlight tool on words/phrases, in the hope that I can then export the comments list (which has page numbers) to some format I can work with. Doesn't work very well for the older issues with the messy fonts, since you can't always tell what the word was supposed to be (Ṣubḥ becomes ˝ubh˝S and maqāmah becomes maqa≠mah or mah≠maqa, and words with lots of diacritics become almost unrecognizable as words). Those fonts were on long-dead Macs running OS7-OS9, so aren't available to me now. </p> Dorothea Salo on "How create a real (with page numbers) index of journal's entire run, from PDFs?" http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1905 Wed, 06 Mar 2013 17:59:59 +0000 Dorothea Salo 1905@http://digitalhumanities.org/answers/ <p>I'm confused. Are you making a concordance (list of words/phrases present in text with pointers), or an index (synthesized list of important terminology, with pointers to meaningful mentions while omitting passing ones)? They're not at all the same thing. </p> olaf on "How create a real (with page numbers) index of journal's entire run, from PDFs?" 
http://digitalhumanities.org/answers/topic/how-create-a-real-with-page-numbers-index-of-journals-entire-run-from-pdfs#post-1904 Wed, 06 Mar 2013 17:57:10 +0000 olaf 1904@http://digitalhumanities.org/answers/ <p>I need to index all back issues of <em><a href="http://mamluk.uchicago.edu">Mamluk Studies Review</a></em> (open access, now digital only but formerly print) but have not had much luck finding ideas about how to go about it.<br /> Searching the Web for info about indexing PDFs leads largely to results about indexing them on a computer for improved searches, or to indexing services.<br /> I hope to find software (or scripts or something!) that can </p> <ul> <li>read PDF files</li> <li>understand the idea of page numbers</li> <li>understand that each page in a pdf is a distinct entity</li> <li>handle Unicode and diacritics (and, ideally, Arabic script)</li> <li>see phrases or hyphenated words that break across pages as single items</li> </ul> <p>I don't expect anything to happen automatically: I know I (or better yet an unwary grad student) will have to actually go through and mark words and phrases to be included in the index.</p> <p>Bonus points if it can be taught to ignore certain strings when alphabetizing. For example, since 'al-' is Arabic for 'the', it doesn't affect alphabetization (so al-Nasir Muhammad goes in the N section).<br /> Similarly, there needs to be a way to instruct it that ā and a are the same for purposes of alphabetization, as are ṣ and s, etc.</p> <p>Super bonus points if it can recognize (or learn to recognize) variations on a word or phrase in terms of spelling (often inconsistent when transliteration is involved), word order or intervening words.</p> <p>What I have: 23 issues of the journal as whole-book pdfs, as well as individual pdfs of all articles. Unfortunately, the first half dozen or so were created without Unicode, using proprietary fonts with non-standard encodings. Messy, but I can work around it somehow. 
I also have InDesign files (various versions) for about half the issues. This will all be done in Windows (32-bit XP and 64-bit 7). I always have the latest version of Acrobat (not Reader, the full program).</p> <p>The resulting index will be posted on the Web, probably both as a PDF and in some more dynamic and usable format(s). </p> <p>Any ideas for ways to streamline this would be appreciated. </p> <p>Thanks!<br /> Olaf </p>